# Final Project - Tal Waitzenberg 305578189 , Lital Morali 302491709

Loading our data

In [1]:
import numpy as np
import keras 
import pandas as pd

df = pd.read_csv('./Data/AllTweets.csv', encoding = 'latin1')
train = df[['text', 'screenName']]

Using Theano backend.


Creating a function to clean the data from symbols, numbers ...

In [2]:
from bs4 import BeautifulSoup 
import re
import nltk
from nltk.corpus import stopwords # Import the stop word list

def clean_tweets(row_tweet):
    # Function to convert a raw tweet to a string of words
    # The input is a single string (a raw tweet), and 
    # the output is a single string (a preprocessed tweet)
    
    # 1. Remove twitter tags
    row_tweet = re.sub('<.*?>', '', row_tweet)
    
    # 2. remove non-letters
    row_tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', row_tweet, flags=re.MULTILINE)
    row_tweet = re.sub(r'[^\w\s\#\@]', '', row_tweet)
    row_tweet = re.sub(' +', ' ', row_tweet)
    row_tweet = re.sub(r'\d+', '', row_tweet)
    
    # 3. Convert to lower case, split into individual words
    lower_case = row_tweet.lower()               # Convert to lower case 
    words = nltk.word_tokenize(lower_case)       # Split into words
    words = lower_case.split()                   # Split by words, basic splitter without NLTK
    
    # 4. convert the stop words to a set
    stop_words = set(stopwords.words("english"))
    
    # 5. Remove stop words
    words = [w for w in words if not w in stop_words]
    
    # 5.1. Reducing inflected and derived words to their word stem, base or root form
    porter = nltk.PorterStemmer()
    lancaster = nltk.LancasterStemmer()

    meaningful_words = [porter.stem(w) for w in words]
    
    # 6. Join the words back into one string separated by space, and return the result.  
    return( " ".join( meaningful_words )) 

Cleaning the data

In [3]:
train['text'] = train['text'].apply(lambda x: clean_tweets(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Prepering the data for processing 

a. Split to training sets for each celebrity: Donald Trump, Kim Kardasihan, Katy Perry, Bill Gates, Kent Beck

In [4]:
trump_train = train[train['screenName'] == 'realDonaldTrump']
kim_train = train[train['screenName'] == 'KimKardashian']
katy_train = train[train['screenName'] == 'katyperry']
bill_train = train[train['screenName'] == 'BillGates']
kent_train = train[train['screenName'] == 'KentBeck']

b. create text data for each train of celebrity 

In [5]:
trump_text_to_train = trump_train['text'].str.cat(sep='. ')
kim_text_to_train = kim_train['text'].str.cat(sep='. ')
katy_text_to_train = katy_train['text'].str.cat(sep='. ')
bill_text_to_train = bill_train['text'].str.cat(sep='. ')
kent_text_to_train = kent_train['text'].str.cat(sep='. ')
print("Example: Donald Trump Text To Train")
print(trump_text_to_train)

Example: Donald Trump Text To Train
new rasmussen poll one accur elect trump approv ratingthat higher os #s. massiv regul cut new legisl bill sign great new scjustic infrastructur healthcar tax cut work. make america great agenda well despit distract witch hunt mani new job high busi enthusiasm. thought prayer sailor uss fitzgerald famili thank japanes alli th. rt @seanhann #hanniti start minut @newtgingrich monologu deep state alli media. back miami cubanamerican friend happi sign today anoth campaign promis forget. nation secur presidenti memorandum strengthen polici unit state toward cuba memorandum. remark presid trump polici usa toward cuba video transcript. great news #maga. investig fire fbi director man told fire fbi director witch hunt. despit phoni witch hunt go america econom amp job number great regul way job enthusiasm way. fake news media hate use turn power social media million peopl go around. month investig amp committe hear collus russian nobodi abl show proof sad. th

Preprocessing The Text
a. create the vocabulary for each celebrity tweets
    - W'll remove infrequent words. Words that appear one or two times will be removed from our vocabulary.
      We want a small vocabulary to make our model to be not slow to train.

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

def create_vocabulary(celebrity_train):
    # Initialize the "CountVectorizer" object, which is scikit-learn's bag of words tool.  
    vectorizer = CountVectorizer(analyzer = "word", tokenizer = None,
                             preprocessor = None,
                             stop_words = None,
                             max_features = 5000,
                             min_df = 2) 

    vectorizer.fit_transform(celebrity_train)
    return vectorizer.get_feature_names()

trump_vocab = create_vocabulary(trump_train['text'])
kim_vocab = create_vocabulary(kim_train['text'])
katy_vocab = create_vocabulary(katy_train['text'])
bill_vocab = create_vocabulary(bill_train['text'])
kent_vocab = create_vocabulary(kent_train['text'])

trump_vocab.append('TWEETSTART')
trump_vocab.append('TWEETEND')


print("Example: Donald Trump Vocabulary")
print(trump_vocab)

Example: Donald Trump Vocabulary
['agenda', 'alli', 'america', 'americafirst', 'american', 'amp', 'around', 'back', 'bill', 'bring', 'build', 'busi', 'campaign', 'clinton', 'collus', 'comey', 'congratul', 'countri', 'cuba', 'day', 'democrat', 'despit', 'econom', 'elect', 'enthusiasm', 'fake', 'famili', 'first', 'forget', 'foxandfriend', 'friend', 'full', 'go', 'great', 'happi', 'hate', 'hear', 'high', 'hillari', 'histori', 'honor', 'hunt', 'illeg', 'incorrect', 'infrastructur', 'investig', 'ivanka', 'job', 'made', 'maga', 'mani', 'media', 'meet', 'million', 'never', 'new', 'news', 'number', 'obamacar', 'obstruct', 'obstructionist', 'one', 'passag', 'peopl', 'phoni', 'polici', 'polit', 'prayer', 'proof', 'protect', 'put', 'real', 'realdonaldtrump', 'regul', 'remark', 'rep', 'report', 'restor', 'rt', 'russian', 'sad', 'scalis', 'sign', 'start', 'state', 'steve', 'stori', 'talk', 'tax', 'th', 'thank', 'thought', 'time', 'today', 'told', 'total', 'toward', 'trump', 'us', 'way', 'well', 'wi

b. Convert special characters into strat token and end token

In [7]:
tweet_start_token = "TWEETSTART"
tweet_end_token = "TWEETEND"

def replace_special_chars(text):
    return text.replace('.',' '+ tweet_end_token + ' ' + tweet_start_token+' ' )

trump_text_to_train = replace_special_chars(trump_text_to_train)
kim_text_to_train = replace_special_chars(kim_text_to_train)
katy_text_to_train = replace_special_chars(katy_text_to_train)
bill_text_to_train = replace_special_chars(bill_text_to_train)
kent_text_to_train = replace_special_chars(kent_text_to_train)
trump_text_to_train

'new rasmussen poll one accur elect trump approv ratingthat higher os #s TWEETEND TWEETSTART  massiv regul cut new legisl bill sign great new scjustic infrastructur healthcar tax cut work TWEETEND TWEETSTART  make america great agenda well despit distract witch hunt mani new job high busi enthusiasm TWEETEND TWEETSTART  thought prayer sailor uss fitzgerald famili thank japanes alli th TWEETEND TWEETSTART  rt @seanhann #hanniti start minut @newtgingrich monologu deep state alli media TWEETEND TWEETSTART  back miami cubanamerican friend happi sign today anoth campaign promis forget TWEETEND TWEETSTART  nation secur presidenti memorandum strengthen polici unit state toward cuba memorandum TWEETEND TWEETSTART  remark presid trump polici usa toward cuba video transcript TWEETEND TWEETSTART  great news #maga TWEETEND TWEETSTART  investig fire fbi director man told fire fbi director witch hunt TWEETEND TWEETSTART  despit phoni witch hunt go america econom amp job number great regul way job en

c. We replace all word not included in our vocabularies by UNKNOWN_TOKEN

In [8]:
unknown_token = 'UNKNOWNTOKEN'   
def replace_with_unknown_token(text,vocab):
    words = text.split()
    for word in words:
        if word not in vocab:
            text = text.replace(' ' + word + ' ', ' ' + unknown_token + ' ')
    return text

trump_text_to_train = replace_with_unknown_token(trump_text_to_train, trump_vocab)
kim_text_to_train = replace_with_unknown_token(kim_text_to_train, kim_vocab)
katy_text_to_train = replace_with_unknown_token(katy_text_to_train, katy_vocab)
bill_text_to_train = replace_with_unknown_token(bill_text_to_train, bill_vocab)
kent_text_to_train = replace_with_unknown_token(kent_text_to_train, kent_vocab)
trump_text_to_train

'new UNKNOWNTOKEN UNKNOWNTOKEN one UNKNOWNTOKEN elect trump UNKNOWNTOKEN UNKNOWNTOKEN UNKNOWNTOKEN UNKNOWNTOKEN UNKNOWNTOKEN TWEETEND TWEETSTART  UNKNOWNTOKEN regul UNKNOWNTOKEN new UNKNOWNTOKEN bill sign great new UNKNOWNTOKEN infrastructur UNKNOWNTOKEN tax UNKNOWNTOKEN work TWEETEND TWEETSTART  UNKNOWNTOKEN america great agenda well despit UNKNOWNTOKEN witch hunt mani new job high busi enthusiasm TWEETEND TWEETSTART  thought prayer UNKNOWNTOKEN UNKNOWNTOKEN UNKNOWNTOKEN famili thank UNKNOWNTOKEN alli th TWEETEND TWEETSTART  rt UNKNOWNTOKEN UNKNOWNTOKEN start UNKNOWNTOKEN UNKNOWNTOKEN UNKNOWNTOKEN UNKNOWNTOKEN state alli media TWEETEND TWEETSTART  back UNKNOWNTOKEN UNKNOWNTOKEN friend happi sign today UNKNOWNTOKEN campaign UNKNOWNTOKEN forget TWEETEND TWEETSTART  UNKNOWNTOKEN UNKNOWNTOKEN UNKNOWNTOKEN UNKNOWNTOKEN UNKNOWNTOKEN polici UNKNOWNTOKEN state toward cuba UNKNOWNTOKEN TWEETEND TWEETSTART  remark UNKNOWNTOKEN trump polici UNKNOWNTOKEN toward cuba UNKNOWNTOKEN UNKNOWNTOKEN TWEE

d. We will use text_to_word_sequence to splits a tweet into a list of word for each celebrity.

In [9]:
from keras.preprocessing.text import text_to_word_sequence
trump_seq = text_to_word_sequence(trump_text_to_train, lower=False, split=" ")
kim_seq = text_to_word_sequence(kim_text_to_train, lower=False, split=" ")
katy_seq = text_to_word_sequence(katy_text_to_train, lower=False, split=" ")
bill_seq = text_to_word_sequence(bill_text_to_train, lower=False, split=" ")
kent_seq = text_to_word_sequence(kent_text_to_train, lower=False, split=" ")
trump_seq[0:10]

['new',
 'UNKNOWNTOKEN',
 'UNKNOWNTOKEN',
 'one',
 'UNKNOWNTOKEN',
 'elect',
 'trump',
 'UNKNOWNTOKEN',
 'UNKNOWNTOKEN',
 'UNKNOWNTOKEN']

e. We will create a toknizer for each celebrity and train it to get the a matrix for each celebrity 

In [10]:
#Tokenizer for vectorizing text
from keras.preprocessing.text import Tokenizer

#Donald Trump Matrix
trump_token = Tokenizer(num_words=600,char_level=False)
trump_token.fit_on_texts(trump_seq)
trump_text_mtx = trump_token.texts_to_matrix(trump_seq, mode='binary')

#Kim Kardashian Matrix
kim_token = Tokenizer(num_words=600,char_level=False)
kim_token.fit_on_texts(kim_seq)
kim_text_mtx = kim_token.texts_to_matrix(kim_seq, mode='binary')

#Katy Perry Matrix
katy_token = Tokenizer(num_words=600,char_level=False)
katy_token.fit_on_texts(katy_seq)
katy_text_mtx = katy_token.texts_to_matrix(katy_seq, mode='binary')

#Bill Gates Matrix
bill_token = Tokenizer(num_words=600,char_level=False)
bill_token.fit_on_texts(bill_seq)
bill_text_mtx = bill_token.texts_to_matrix(bill_seq, mode='binary')

#Kent Beck Matrix
kent_token = Tokenizer(num_words=600,char_level=False)
kent_token.fit_on_texts(kent_seq)
kent_text_mtx = kent_token.texts_to_matrix(kent_seq, mode='binary')

Input And Output<br />
We will create input & output for each celebrity

a. We want to predict the next word, so output will be the input matrix shifted by one row. <br />
b. checking that they both have the same number of rows.

In [11]:
#Donald Trump Input&Output
trump_input = trump_text_mtx[:-1]
trump_output = trump_text_mtx[1:]

#Kim Kardashian Input&Output
kim_input = kim_text_mtx[:-1]
kim_output = kim_text_mtx[1:]

#Katy Perry Input&Output
katy_input = katy_text_mtx[:-1]
katy_output = katy_text_mtx[1:]

#Bill Gates Input&Output
bill_input = bill_text_mtx[:-1]
bill_output = bill_text_mtx[1:]

#Kent Beck Input&Output
kent_input = kent_text_mtx[:-1]
kent_output = kent_text_mtx[1:]

trump_input.shape, trump_output.shape

((662, 600), (662, 600))

<h2>Training the Model</h2><br />
We will train 5 models for each celebrity

<h4>Donald Trump Model</h4>

In [12]:
#from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Flatten
from keras.layers.wrappers import TimeDistributed
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM

#Turn positive integers (indexes) into dense vectors of fixed size.
trump_model = Sequential()
trump_model.add(Embedding(input_dim=trump_input.shape[0],output_dim= 42, input_length=trump_input.shape[1]))
# the model will take as input an integer matrix of size (batch, vocabulary_size).
# the largest integer (i.e. word index) in the input should be no larger than 999 (vocabulary size).
# now model.output_shape == (None, vocabulary_size, 42), where None is the batch dimension.

#connect it to a dense output layer.
trump_model.add(Flatten())
trump_model.add(Dense(trump_output.shape[1], activation='sigmoid'))
trump_model.summary()

#training the model
trump_model.compile(loss='categorical_crossentropy', optimizer='rmsprop',metrics=["accuracy"])
trump_model.fit(trump_input, y=trump_output, batch_size=300, epochs=100, verbose=1, validation_split=0.2)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 600, 42)           27804     
_________________________________________________________________
flatten_1 (Flatten)          (None, 25200)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 600)               15120600  
Total params: 15,148,404
Trainable params: 15,148,404
Non-trainable params: 0
_________________________________________________________________
Train on 529 samples, validate on 133 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 

Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x1dfca7f1630>

<h4>Kim Kardashian Model</h4>

In [13]:
#Turn positive integers (indexes) into dense vectors of fixed size.
kim_model = Sequential()
kim_model.add(Embedding(input_dim=kim_input.shape[0],output_dim= 42, input_length=kim_input.shape[1]))
# the model will take as input an integer matrix of size (batch, vocabulary_size).
# the largest integer (i.e. word index) in the input should be no larger than 999 (vocabulary size).
# now model.output_shape == (None, vocabulary_size, 42), where None is the batch dimension.

#connect it to a dense output layer.
kim_model.add(Flatten())
kim_model.add(Dense(kim_output.shape[1], activation='sigmoid'))
kim_model.summary()

#training the model
kim_model.compile(loss='categorical_crossentropy', optimizer='rmsprop',metrics=["accuracy"])
kim_model.fit(kim_input, y=kim_output, batch_size=300, epochs=100, verbose=1, validation_split=0.2)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 600, 42)           18942     
_________________________________________________________________
flatten_2 (Flatten)          (None, 25200)             0         
_________________________________________________________________
dense_2 (Dense)              (None, 600)               15120600  
Total params: 15,139,542
Trainable params: 15,139,542
Non-trainable params: 0
_________________________________________________________________
Train on 360 samples, validate on 91 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 2

Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x1dfcc54c6a0>

<h4>Katy Perry Model</h4>

In [14]:
#Turn positive integers (indexes) into dense vectors of fixed size.
katy_model = Sequential()
katy_model.add(Embedding(input_dim=katy_input.shape[0],output_dim= 42, input_length=katy_input.shape[1]))
# the model will take as input an integer matrix of size (batch, vocabulary_size).
# the largest integer (i.e. word index) in the input should be no larger than 999 (vocabulary size).
# now model.output_shape == (None, vocabulary_size, 42), where None is the batch dimension.

#connect it to a dense output layer.
katy_model.add(Flatten())
katy_model.add(Dense(katy_output.shape[1], activation='sigmoid'))
katy_model.summary()

#training the model
katy_model.compile(loss='categorical_crossentropy', optimizer='rmsprop',metrics=["accuracy"])
katy_model.fit(katy_input, y=katy_output, batch_size=300, epochs=100, verbose=1, validation_split=0.2)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 600, 42)           25830     
_________________________________________________________________
flatten_3 (Flatten)          (None, 25200)             0         
_________________________________________________________________
dense_3 (Dense)              (None, 600)               15120600  
Total params: 15,146,430
Trainable params: 15,146,430
Non-trainable params: 0
_________________________________________________________________
Train on 492 samples, validate on 123 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 

Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x1dfcc945a90>

<h4>Bill Gates Model</h4>

In [15]:
#Turn positive integers (indexes) into dense vectors of fixed size.
bill_model = Sequential()
bill_model.add(Embedding(input_dim=bill_input.shape[0],output_dim= 42, input_length=bill_input.shape[1]))
# the model will take as input an integer matrix of size (batch, vocabulary_size).
# the largest integer (i.e. word index) in the input should be no larger than 999 (vocabulary size).
# now model.output_shape == (None, vocabulary_size, 42), where None is the batch dimension.

#connect it to a dense output layer.
bill_model.add(Flatten())
bill_model.add(Dense(bill_output.shape[1], activation='sigmoid'))
bill_model.summary()

#training the model
bill_model.compile(loss='categorical_crossentropy', optimizer='rmsprop',metrics=["accuracy"])
bill_model.fit(bill_input, y=bill_output, batch_size=300, epochs=100, verbose=1, validation_split=0.2)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 600, 42)           4620      
_________________________________________________________________
flatten_4 (Flatten)          (None, 25200)             0         
_________________________________________________________________
dense_4 (Dense)              (None, 600)               15120600  
Total params: 15,125,220
Trainable params: 15,125,220
Non-trainable params: 0
_________________________________________________________________
Train on 88 samples, validate on 22 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28

Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x1dfcaf4b908>

<h4>Kent Beck Model</h4>

In [16]:
#Turn positive integers (indexes) into dense vectors of fixed size.
kent_model = Sequential()
kent_model.add(Embedding(input_dim=kent_input.shape[0],output_dim= 42, input_length=kent_input.shape[1]))
# the model will take as input an integer matrix of size (batch, vocabulary_size).
# the largest integer (i.e. word index) in the input should be no larger than 999 (vocabulary size).
# now model.output_shape == (None, vocabulary_size, 42), where None is the batch dimension.

#connect it to a dense output layer.
kent_model.add(Flatten())
kent_model.add(Dense(kent_output.shape[1], activation='sigmoid'))
kent_model.summary()

#training the model
kent_model.compile(loss='categorical_crossentropy', optimizer='rmsprop',metrics=["accuracy"])
kent_model.fit(kent_input, y=kent_output, batch_size=300, epochs=100, verbose=1, validation_split=0.2)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 600, 42)           10668     
_________________________________________________________________
flatten_5 (Flatten)          (None, 25200)             0         
_________________________________________________________________
dense_5 (Dense)              (None, 600)               15120600  
Total params: 15,131,268
Trainable params: 15,131,268
Non-trainable params: 0
_________________________________________________________________
Train on 203 samples, validate on 51 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 2

Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x1df807644e0>

<h2>Chcek The Models</h2> <br />
We will define a function that accepts a word, convert it to its one-hot representation, predict the following word using the trained model, and finally convert the predicted one-hot result into a text and returns it.

In [17]:
def get_next(text,token,model,fullmtx,fullText):
    tmp = text_to_word_sequence(text, lower=False, split=" ")
    tmp = token.texts_to_matrix(tmp, mode='binary')
    p = model.predict(tmp)
    top10 = p.argsort() [0][-10:]
    bestMatch = np.random.choice(top10,1)[0]
    next_idx = np.min(np.where(fullmtx[:,bestMatch]>0))
    return fullText[next_idx]

<h3>Function for creating tweets</h3> <br />
W'll define a function that generates 30% tweets from the data for each celebruty and also the function checks that each tweet is no more than 140 characters

In [18]:
def creatingTweets(epochs,token,model,fullMtx,fullText):
    output_text = ""
    for i in range(0,epochs-1):
        tweet =""
        word = get_next('TWEETSTART',token,model,fullMtx,fullText)
        while(word != 'TWEETEND' and len(tweet) <= (140 - (len(word) + 1))):
            if (word != 'TWEETSTART'):
                tweet += ' ' + word
            word = get_next(word,token,model,fullMtx,fullText)
        output_text += tweet + '. '
    return output_text

Now lets generate tweets for each celebrity  

In [19]:
trump_new_tweets = creatingTweets(15,trump_token,trump_model,trump_text_mtx,trump_seq)
kim_new_tweets = creatingTweets(15,kim_token,kim_model,kim_text_mtx,kim_seq)
katy_new_tweets = creatingTweets(15,katy_token,katy_model,katy_text_mtx,katy_seq)
#bill_new_tweets = creatingTweets(10,bill_token,bill_model,bill_text_mtx,bill_seq)
kent_new_tweets = creatingTweets(10,kent_token,kent_model,kent_text_mtx,kent_seq)
trump_new_tweets

' never forget wisconsin.  made fake despit news job great UNKNOWNTOKEN great agenda america hunt news time great new great UNKNOWNTOKEN witch hunt great despit great.  made. .  never job high phoni great.  rt UNKNOWNTOKEN witch.  despit phoni stori russian great number hunt mani news great.  thank wisconsin hunt great news made news great number russian.  rt.  fake despit phoni great UNKNOWNTOKEN friend happi great news fake great state.  UNKNOWNTOKEN great news report.  congratul hunt hunt UNKNOWNTOKEN.  thank news time russian UNKNOWNTOKEN UNKNOWNTOKEN UNKNOWNTOKEN friend russian news thank phoni collus.  never. '

<h4>Now let's convert our new tweets to data frame</h4>

In [60]:
def convert_tweets_to_dataframe(text):
    columns = "text"
    tweets = text.split('. ')
    df_text = pd.DataFrame(columns=columns.split(' '),data=[t for t in tweets])
    df_screen_name = pd.DataFrame(columns=['screenName','realResult'])
    df_output = pd.concat([df_text, df_screen_name])
    return df_output[:-1]

#creates data frame to each celebrity
trump_df = convert_tweets_to_dataframe(trump_new_tweets)
trump_df['realResult'] = 'realDonaldTrump'
kim_df = convert_tweets_to_dataframe(kim_new_tweets)
kim_df['realResult'] = 'KimKardashian'
katy_df = convert_tweets_to_dataframe(katy_new_tweets)
katy_df['realResult'] = 'katyperry'
kent_df = convert_tweets_to_dataframe(kent_new_tweets)
kent_df['realResult'] = 'KentBeck'

new_tweets_df = pd.concat([trump_df,kim_df,katy_df,kent_df],ignore_index=True)
new_tweets_df = new_tweets_df.replace(np.nan, '', regex=True)
new_tweets_df[0:10]

Unnamed: 0,realResult,screenName,text
0,realDonaldTrump,,remark happi peopl rt UNKNOWNTOKEN
1,realDonaldTrump,,great news job great state wisconsin great
2,realDonaldTrump,,passag amp go passag
3,realDonaldTrump,,happi great news amp UNKNOWNTOKEN great
4,realDonaldTrump,,never UNKNOWNTOKEN
5,realDonaldTrump,,rt great agenda UNKNOWNTOKEN UNKNOWNTOKEN wit...
6,realDonaldTrump,,UNKNOWNTOKEN
7,realDonaldTrump,,never
8,realDonaldTrump,,happi stori fake
9,realDonaldTrump,,congratul stori peopl great news amp great UN...


<h2>Final Project Part B</h2>

Now we will use out classification model from part B to predict the results

<h3>Data Cleaning:</h3>

Loop through and clean all of the training set at once.

In [50]:
clean_train_tweets = train['text'].apply(lambda x: clean_tweets(x))

<h4>BOW - Bag of words</h4>

In [51]:
print("Creating the bag of words...\n")
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the "CountVectorizer" object, which is scikit-learn's bag of words tool.  
vectorizer = CountVectorizer(analyzer = "word", tokenizer = None,
                             preprocessor = None,
                             stop_words = None,
                             max_features = 5000) 

train_data_features = vectorizer.fit_transform(clean_train_tweets)
# convert the result to an array
train_data_features = train_data_features.toarray()

print(train_data_features.shape)

Creating the bag of words...

(179, 870)


The vocabulary after the Bag of Words model is trained:

In [52]:
vocab = vectorizer.get_feature_names()
print(vocab)

['abl', 'abus', 'account', 'accur', 'acknowledg', 'act', 'ad', 'african', 'ag', 'age', 'agenda', 'agileklzkitten', 'ago', 'ahead', 'aint', 'air', 'akutienamekim', 'album', 'alechuerta', 'alli', 'almostanart', 'alreadi', 'also', 'aluminum', 'alway', 'amaz', 'america', 'americafirst', 'american', 'amp', 'amsterdam', 'andynishan', 'angel', 'annual', 'anoth', 'answer', 'anyon', 'anyth', 'apolog', 'app', 'approv', 'appétit', 'armi', 'around', 'arriv', 'art', 'ask', 'assess', 'asshol', 'aussie_kardash', 'autograph', 'avail', 'awkwardaya', 'babe', 'babi', 'back', 'bad', 'badli', 'ball', 'ban', 'beauti', 'bed', 'begin', 'behind', 'believ', 'best', 'better', 'big', 'bigbabyjonathan', 'biggi', 'bigsean', 'bike', 'bill', 'bin', 'bird', 'birthday', 'bleach', 'blend', 'bling', 'bloomberg', 'bon', 'bonu', 'book', 'box', 'br', 'braveri', 'bring', 'brush', 'budget', 'buffalo', 'build', 'busi', 'ca', 'cabinet', 'call', 'calvinharri', 'came', 'campaign', 'cant', 'capitolrecord', 'care', 'carlythekatycat

<h2>Now We will check the Results with our classification model</h2>

In [53]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier

#split to train & test
msk = np.random.rand(len(train)) < 0.8
train_x = train_data_features[msk]
test_x = train_data_features[~msk]
train_y = train.loc[msk,"screenName"]
test_y = train.loc[~msk,"screenName"]

# Initialize a Random Forest classifier with 100 trees
forest = RandomForestClassifier(n_estimators = 100) 

# Fit the forest to the training set, using the bag of words as 
# features and the sentiment labels as the response variable
#
# This may take a few minutes to run
forest = forest.fit( train_x, train_y )
# Evaluate accuracy best on the test set
res = forest.score(test_x,test_y)
res


0.73170731707317072

<h2>Check Our New Tweets With Our Trained Random Forest Classifier</h2>

We creating a new vectorizer with the vocabulary of our previous one and predict who wrote each tweet

In [65]:
# Initialize the "CountVectorizer" object, which is scikit-learn's bag of words tool.  
vectorizer_new = CountVectorizer(analyzer = "word", tokenizer = None,
                             preprocessor = None,
                             stop_words = None,
                             max_features = 5000,
                             vocabulary = vectorizer.vocabulary_) 

train_data_features_new = vectorizer_new.fit_transform(new_tweets_df['text'])
# convert the result to an array
train_data_features_new = train_data_features_new.toarray()

predicted_results = forest.predict(train_data_features_new)
predicted_results

array(['KimKardashian', 'realDonaldTrump', 'realDonaldTrump',
       'realDonaldTrump', 'KimKardashian', 'realDonaldTrump',
       'KimKardashian', 'KimKardashian', 'KimKardashian',
       'realDonaldTrump', 'realDonaldTrump', 'KimKardashian',
       'realDonaldTrump', 'realDonaldTrump', 'KimKardashian',
       'KimKardashian', 'KimKardashian', 'KimKardashian', 'KimKardashian',
       'KimKardashian', 'KimKardashian', 'KimKardashian', 'KimKardashian',
       'KimKardashian', 'KimKardashian', 'KimKardashian', 'KimKardashian',
       'KimKardashian', 'katyperry', 'katyperry', 'katyperry', 'katyperry',
       'katyperry', 'katyperry', 'katyperry', 'katyperry', 'katyperry',
       'katyperry', 'katyperry', 'katyperry', 'katyperry', 'katyperry',
       'KentBeck', 'KentBeck', 'KentBeck', 'KentBeck', 'KentBeck',
       'KentBeck', 'KentBeck', 'KentBeck', 'KentBeck'], dtype=object)

Now lets check the accuracy of the predicted results according to the real results

In [67]:
new_tweets_df['screenName'] = predicted_results
diff_df = new_tweets_df[new_tweets_df['screenName'] != new_tweets_df['realResult']]
accuracy = abs(new_tweets_df.shape[0] - diff_df.shape[0]) / new_tweets_df.shape[0]
print(accuracy)

0.8823529411764706


let's create the confusion matrix of our mistakes 

In [112]:
from sklearn.metrics import confusion_matrix
c_m = confusion_matrix(new_tweets_df['realResult'].tolist(),new_tweets_df['screenName'].tolist()).tolist()
a =['Name','Trump','Kim','Katy','Kent']

# add column to matrix
for i in range(0,4):
    c_m[i].reverse()
    c_m[i].append(a[i+1])
    c_m[i].reverse()

# add row to matrix
c_m.reverse()
c_m.append(a)
c_m.reverse()

print('\n'.join(['\t'.join(['{:4}'.format(item) for item in row]) for row in c_m]))

Name	Trump	Kim 	Katy	Kent
Trump	   9	   0	   0	   0
Kim 	   0	  14	   0	   0
Katy	   0	   0	  14	   0
Kent	   0	   6	   0	   8


Let's explain, each cell representing the number of times that we predicted that the tweet belongs to the celebrity name from the column and the tweet is really belogns to the celebrity name from the row. therefore the diagonal representing the number of times that we predicted the correct result.   