## Apply Word2Vec

In [1]:
import pandas as pd # provide sql-like data manipulation tools. very handy.
pd.options.mode.chained_assignment = None
import numpy as np # high dimensional vector computing library.
from copy import deepcopy
from string import punctuation
from random import shuffle

import gensim
from gensim.models.word2vec import Word2Vec # the word2vec model gensim class
LabeledSentence = gensim.models.doc2vec.LabeledSentence # we'll talk about this down below

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

from nltk.tokenize import TweetTokenizer # a tweet tokenizer from nltk.
tokenizer = TweetTokenizer()

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

import datetime,time

In [334]:
data = pd.read_csv(r"C:\Xing\cs591_project\TextMining\LabelledData.csv",
                  parse_dates = [1],encoding = "ISO-8859-1")
data['index']=data['index']-pd.Timedelta(hours=2)

In [335]:
data

Unnamed: 0.1,Unnamed: 0,index,label,permalink,text,id,username,retweets,favorites,mentions,hashtags,geo
0,1,2019-03-13 06:27:00,N,https://twitter.com/_acephale/status/110582274...,The Art Institute of Chicago holds the Mary Re...,1.110000e+18,,0,11,,,
1,2,2019-04-07 05:14:00,N,https://twitter.com/_Alexis_Marie/status/11148...,"Exciting day ahead in Chicago! 1: Cloud Gate, ...",1.110000e+18,,0,0,,,
2,3,2019-04-10 16:42:00,N,https://twitter.com/_Alexis_Marie/status/11161...,Art Institute of Chicago @ artinstitutechi . #...,1.120000e+18,,0,1,@,# # # # # # # # # # #,
3,4,2019-03-22 11:26:00,N,https://twitter.com/_isragr/status/11091593444...,"# VanGogh of the Day: Fishing in Spring, the P...",1.110000e+18,,0,0,,#,
4,5,2019-03-21 20:19:00,N,https://twitter.com/_missChicago_/status/11089...,Face Time: Rembrandts Self-Portraits | The Ar...,1.110000e+18,,0,0,,#,
5,6,2019-03-21 20:57:00,N,https://twitter.com/_NoMin92/status/1108940787...,The Art Institute of Chicago pic.twitter.com/T...,1.110000e+18,,0,0,,,
6,7,2019-03-19 16:13:00,N,https://twitter.com/1225OldTown/status/1108144...,Our proximity to public transportation makes i...,1.110000e+18,,0,1,,,
7,8,2019-03-18 15:31:00,N,https://twitter.com/192kbcom/status/1107771425...,Tortoise's TNT is weirdly beautiful and imposs...,1.110000e+18,,0,0,,,
8,9,2019-03-20 07:28:00,N,https://twitter.com/192kbcom/status/1108374713...,Watch @ marylattimore perform from the Womans...,1.110000e+18,,0,0,@ @,,
9,10,2019-03-14 08:08:00,N,https://twitter.com/19thcenturyart2/status/110...,Mary Cassatt - Mother and Child (c. 1900) - Ar...,1.110000e+18,,1,2,,# # #,


### Tokenize and clean data

In [8]:
data.iloc[1000,:]

Unnamed: 0                                                  981
index                                       2019-03-28 07:43:00
label                                                         N
permalink     https://twitter.com/arte_bot/status/1111277607...
text          Cem vistas de Edo, de Albrecht Dürer. Gravura,...
id                                                     1.11e+18
username                                                    NaN
retweets                                                      0
favorites                                                     0
mentions                                                    NaN
hashtags                                                    NaN
geo                                                         NaN
Name: 1000, dtype: object

In [5]:
def tokenize(tweet):
    try:
        # Step 0: Convert to lower case
        tweet = tweet.lower()
        
        # Step 1: Tokenize the tweets
        tokens = tokenizer.tokenize(tweet)
        
        # Step 2: Filter
        
        #tokens = filter(lambda t: not t.startswith('@'), tokens)
        #tokens = filter(lambda t: not t.startswith('#'), tokens)
        #tokens = filter(lambda t: not t.startswith('http'), tokens)
        return [i for i in tokens]
    except:
        return 'NC'

In [6]:
import tldextract

In [7]:
tldextract.extract('http://forums.news.cnn.com/').domain

'cnn'

In [14]:
def extract_DM(tweet):
    # This function will extract the domain name from 
    # Step 1: Tokenize the tweets
    tokens = tokenizer.tokenize(tweet)
    urls = filter(lambda t: t.startswith('http'), tokens)
    urls = [url for url in urls]
    DM = []
    if len(urls):
        for url in urls: 
            DM.append(tldextract.extract(url).domain)
        return DM
    else:
        return 'NC'

In [17]:
#data_Processed.tokens.map(set(['#']).issubset)
def count_string(string):
    count=[]
    for index in data_Processed.index:
        token_list=data_Processed.loc[index,'tokens']
        num_str=0
        for element in token_list:
            if string in element: 
                num_str+=1
        count.append(num_str)
    return count

In [323]:
def pre_process(data,n=1000):
    data = data.iloc[:n,:]
    data['tokens'] = data['text'].progress_map(tokenize)  ## progress_map is a variant of the map function plus a progress bar. Handy to monitor DataFrame creations.
    data = data[data.tokens != 'NC']
    data['username'] = data['permalink'].progress_map(lambda s:s.split('/')[3])
    data['DomainName'] = data['text'].progress_map(extract_DM)
    data['rawtext']= data['text']
    
    # Feature generation
    data_Processed = data[['index','retweets','tokens','rawtext','favorites','mentions','username','DomainName']]
    data_Processed['hour'] = data_Processed['index'].progress_map(lambda s:s.hour)
    data_Processed['min'] = data_Processed['index'].progress_map(lambda s:s.minute)
    data_Processed['dayOfWeek'] = data_Processed['index'].progress_map(lambda s:s.dayofweek)
    

    return data_Processed

### Feature
* url: using module: "tldextract"
* reply
* uername: only for extract occupany profile

In [336]:
data_Processed = pre_process(data)
data_Processed['DomainName']=data_Processed['DomainName'].apply(lambda x: ''.join(map(str, x)))
data_Processed['DomainName'] = np.where(((data_Processed.DomainName == 'instagram')|(data_Processed.DomainName == 'facebook')|(data_Processed.DomainName == 'swarmapp')), 1, 0)
data_Processed['OpenHour']=np.where(((data_Processed['index'].dt.time>datetime.time(hour=10,minute=30))&(data_Processed['index'].dt.hour<17)&(data_Processed['index'].dt.dayofweek!=3))|((data_Processed['index'].dt.time>datetime.time(hour=10,minute=30))&(data_Processed['index'].dt.hour<20)&(data_Processed['index'].dt.dayofweek==3)),1,0)
data_Processed['hashtags']=count_string('#')
data_Processed['mentions']=count_string('@')
data_Processed['username']=data_Processed['username'].str.lower()
Boolean_username=data_Processed['username'].str.contains('art|chicago|archae|news|museum|chicago|book|studio|info|photo|media', regex=True)
data_Processed['username'] = np.where(Boolean_username==True, 0, 1)

progress-bar: 100%|█████████████████████| 1000/1000 [00:00<00:00, 15150.04it/s]
progress-bar: 100%|████████████████████| 1000/1000 [00:00<00:00, 999834.09it/s]
progress-bar: 100%|█████████████████████| 1000/1000 [00:00<00:00, 12819.21it/s]
progress-bar: 100%|████████████████████| 1000/1000 [00:00<00:00, 249958.52it/s]
progress-bar: 100%|████████████████████| 1000/1000 [00:00<00:00, 249988.32it/s]
progress-bar: 100%|████████████████████| 1000/1000 [00:00<00:00, 249958.52it/s]


In [21]:
### Domain name: only use the top 10 most frequent domain
Top10Domain = data_Processed['DomainName'].value_counts().index[1:11]
#for Domain in Top10Domain:
 #   data_Processed[Domain[0]] = data_Processed['DomainName'].progress_map(lambda s: Domain in s)

In [22]:
Top10Domain

Int64Index([1], dtype='int64')

### Word2Vec

In [337]:
# Divide dataset into train set and test set
x_train,x_test, y_train,y_test = train_test_split(data_Processed,np.array(data['label'][:1000]),
                                                                                     test_size = 0.2,random_state=42)

In [24]:
n_dim = 300

In [25]:
def w2v_load_model(fname):
    from gensim.models.keyedvectors import KeyedVectors
    return KeyedVectors.load_word2vec_format(fname, binary=True)

In [26]:
def txt2vectors(self,words):
    """
    Convert input text into an iterator that returns the corresponding vector representation of each
    word in the text, if it exists in the Word2Vec model
    :param words: input words
    :param is_html: if True, then extract the text from the input HTML
    :return: iterator of vectors created from the words in the text using the Word2Vec model.
    """
    words = [w for w in words if w in self._model]
    if len(words) != 0:
        for w in words:
            yield self._model[w]

In [27]:
# Load model
w2v_model = w2v_load_model(r"C:\Xing\cs591_project\GoogleNews-vectors-negative300-002.bin")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [32]:
w2v_model.most_similar('good')

MemoryError: 

#### tweets2vectors: M1:  Average

#### tweets2vectors: M2:  TF-IDF

In [28]:
print('building tf-idf matrix....')
vectorizer = TfidfVectorizer(analyzer= lambda x:x,min_df = 10)

matrix = vectorizer.fit_transform(data_Processed.tokens)
tfidf = dict(zip(vectorizer.get_feature_names(),vectorizer.idf_))
print('vocab size:',len(tfidf))

building tf-idf matrix....
vocab size: 249


In [29]:
def buildWordVector(tokens,size,method = 1):
    vec = np.zeros(size).reshape((1,size))
    count = 0
    for word in tokens:
        try:
            if method == 1:
                vec += w2v_model[word].reshape((1,size))*tfidf[word]
            elif method == 0:
                vec += w2v_model[word].reshape((1,size))
            count += 1
        except KeyError:
            continue
    if count != 0:
        vec /= count
    return vec

In [168]:
from sklearn.preprocessing import scale
train_vecs_w2v = np.concatenate([buildWordVector(z, n_dim) for z in tqdm(x_train.tokens)])
train_vecs_w2v = scale(train_vecs_w2v)

test_vecs_w2v = np.concatenate([buildWordVector(z, n_dim) for z in tqdm(x_test.tokens)])
test_vecs_w2v = scale(test_vecs_w2v)

100%|██████████████████████████████████████| 800/800 [00:00<00:00, 2919.42it/s]
100%|██████████████████████████████████████| 200/200 [00:00<00:00, 1249.87it/s]


In [31]:
#importing bokeh library for interactive dataviz
import bokeh.plotting as bp
from bokeh.models import HoverTool,BoxSelectTool
from bokeh.plotting import figure, show, output_notebook

## Model Building

**Note**:
1. Add a pre-selection rule: if this observation is not in office hour, then it will be labled as "N" directly

### Building a sentiment Classifier using Keras

NOw we are ready to feed these vectors into a NN classifier

In [32]:
from keras.models import Sequential
from keras.layers import Dense, LSTM

Using TensorFlow backend.


In [373]:
model = Sequential()
model.add(Dense(32,activation ='relu',input_dim = 309))
model.add(Dense(1,activation = 'sigmoid'))
model.compile(optimizer = 'rmsprop',
             loss = 'binary_crossentropy',
             metrics = ['accuracy'])

In [374]:
x_train_withNAN = x_train[['retweets','favorites','hour','dayOfWeek','DomainName','OpenHour','mentions','hashtags','username']].join(pd.DataFrame(train_vecs_w2v))
x_test_withNAN = x_test[['retweets','favorites','hour','dayOfWeek','DomainName','OpenHour','mentions','hashtags','username']].join(pd.DataFrame(test_vecs_w2v))

#x_train_withNAN = pd.DataFrame(train_vecs_w2v).iloc[list(x_train.index),:]
#x_test_withNAN = pd.DataFrame(test_vecs_w2v).iloc[list(x_test.index),:]
x_train_all = x_train_withNAN[np.invert(np.isnan(x_train_withNAN[0]))]
y_train_all = y_train[np.invert(np.isnan(x_train_withNAN[0]))]

x_test_all = x_test_withNAN[np.invert(np.isnan(x_test_withNAN[0]))]
y_test_all = y_test[np.invert(np.isnan(x_test_withNAN[0]))]

y_train_all[y_train_all=='N']=0
y_train_all[y_train_all=='Y']=1

y_test_all[y_test_all=='N']=0
y_test_all[y_test_all=='Y']=1


In [375]:
model.fit(x_train_all,y_train_all,epochs = 10,batch_size=10,verbose = 2)

Epoch 1/10
 - 1s - loss: 0.6566 - acc: 0.7112
Epoch 2/10
 - 0s - loss: 0.5402 - acc: 0.7671
Epoch 3/10
 - 0s - loss: 0.4612 - acc: 0.8121
Epoch 4/10
 - 0s - loss: 0.4087 - acc: 0.8432
Epoch 5/10
 - 0s - loss: 0.3810 - acc: 0.8634
Epoch 6/10
 - 0s - loss: 0.3396 - acc: 0.8866
Epoch 7/10
 - 0s - loss: 0.3212 - acc: 0.9037
Epoch 8/10
 - 0s - loss: 0.2911 - acc: 0.9099
Epoch 9/10
 - 0s - loss: 0.2720 - acc: 0.9193
Epoch 10/10
 - 0s - loss: 0.2343 - acc: 0.9301


<keras.callbacks.History at 0x6581af98>

In [341]:
y_test_all

array([0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], dtype=object)

In [342]:
score = model.evaluate(x_train_all,y_train_all,batch_size=10,verbose = 2)
print(score[1])
score = model.evaluate(x_test_all,y_test_all,batch_size=10,verbose = 2)
print(score[1])

0.9565217324665615
0.8787878679506707


In [40]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [400]:
yhat_probs = model.predict(x_test_all, verbose=0)
yhat_probs = yhat_probs[:, 0]
#yhat_classes = model.predict_classes(x_test_all, verbose=0)
yhat_classes=np.where(yhat_probs > 0.48, 1, 0)
y_label=y_test_all.astype('int32') 
yhat_classes

array([0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0])

In [401]:
df=pd.DataFrame(data.iloc[x_test_all.index.values,:][['index','text']]).join(pd.DataFrame(data_Processed.iloc[x_test_all.index.values,:][['OpenHour']]))
for i,ele in enumerate(list(df['OpenHour'])):
    if list(df['OpenHour'])[i]==0:
        yhat_classes[i]=0
yhat_classes

array([0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0])

In [402]:
# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(y_label, yhat_classes)
print('Accuracy: %f' % accuracy)
# precision tp / (tp + fp)5
precision = precision_score(y_label, yhat_classes)
print('Precision: %f' % precision)
recall = recall_score(y_label, yhat_classes)
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(y_label, yhat_classes)
print('F1 score: %f' % f1)


Accuracy: 0.909091
Precision: 0.750000
Recall: 0.857143
F1 score: 0.800000


In [393]:
#comparison list of the result

df=pd.DataFrame(data.iloc[x_test_all.index.values,:][['index','text']]).join(pd.DataFrame(data_Processed.iloc[x_test_all.index.values,:][['OpenHour']]))
df['label']=y_test_all
df['predict']=yhat_classes
df['predictvalue']=yhat_probs
df

Unnamed: 0,index,text,OpenHour,label,predict,predictvalue
136,2019-04-01 03:20:00,Art Institute of Chicago will be hosting Gregg...,0,0,0,0.002574
76,2019-03-14 23:13:00,"Perpetual Art Machine, de Anna Bella Geiger. V...",0,0,0,0.128573
174,2019-04-10 09:11:00,Original Bachor added to the collection of the...,0,0,0,0.45327
101,2019-04-02 14:43:00,"Leão de Veneza, de Mark Rothko. Escultura, Art...",1,0,0,0.046376
139,2019-03-19 04:03:00,The Art Institute of Chicago is hosting Every...,0,0,0,0.027524
198,2019-02-08 11:27:00,I'm at The Art Institute of Chicago - @ artins...,1,1,1,0.60308
59,2019-03-12 15:38:00,Art Institute was amazing! # rembrandt # beaut...,1,1,0,0.168298
96,2019-03-30 16:43:00,"Cut Piece, de Djanira. Performance, Art Instit...",1,0,1,0.633092
23,2019-03-27 10:48:00,Hopper @The Art Institute of Chicago https://w...,1,1,1,0.500968
30,2019-03-28 13:38:00,I'm at The Art Institute of Chicago - @ artins...,1,1,1,0.730363


In [58]:
score[1]

0.8181818135373005

### Using Random Forest

In [405]:
from sklearn.ensemble import RandomForestClassifier

#Create a Gaussian Classifier
clf=RandomForestClassifier(n_estimators=50)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(x_train_all,list(y_train_all))

yhat_classes=clf.predict(x_test_all)
y_label=y_test_all.astype('int32') 

In [406]:
# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(y_label, yhat_classes)
print('Accuracy: %f' % accuracy)
# precision tp / (tp + fp)
precision = precision_score(y_label, yhat_classes)
print('Precision: %f' % precision)
recall = recall_score(y_label, yhat_classes)
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(y_label, yhat_classes)
print('F1 score: %f' % f1)

Accuracy: 0.909091
Precision: 0.833333
Recall: 0.714286
F1 score: 0.769231


In [357]:
#comparison list of the result

df=pd.DataFrame(data.iloc[x_test_all.index.values,:][['index','text']]).join(pd.DataFrame(data_Processed.iloc[x_test_all.index.values,:][['OpenHour']]))
df['label']=y_test_all
df['predict']=yhat_classes
df['predictvalue']=yhat_probs
df

Unnamed: 0,index,text,OpenHour,label,predict,predictvalue
136,2019-04-01 03:20:00,Art Institute of Chicago will be hosting Gregg...,0,0,0,0.009125
76,2019-03-14 23:13:00,"Perpetual Art Machine, de Anna Bella Geiger. V...",0,0,0,0.192214
174,2019-04-10 09:11:00,Original Bachor added to the collection of the...,0,0,0,0.246868
101,2019-04-02 14:43:00,"Leão de Veneza, de Mark Rothko. Escultura, Art...",1,0,0,0.068814
139,2019-03-19 04:03:00,The Art Institute of Chicago is hosting Every...,0,0,0,0.032295
198,2019-02-08 11:27:00,I'm at The Art Institute of Chicago - @ artins...,1,1,1,0.75639
59,2019-03-12 15:38:00,Art Institute was amazing! # rembrandt # beaut...,1,1,0,0.164905
96,2019-03-30 16:43:00,"Cut Piece, de Djanira. Performance, Art Instit...",1,0,0,0.828998
23,2019-03-27 10:48:00,Hopper @The Art Institute of Chicago https://w...,1,1,0,0.851443
30,2019-03-28 13:38:00,I'm at The Art Institute of Chicago - @ artins...,1,1,1,0.751096


### Using SVM

In [397]:
from sklearn import svm
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(x_train_all,list(y_train_all))
yhat_classes= SVM.predict(x_test_all)
y_label=y_test_all.astype('int32') 

In [398]:
# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(y_label, yhat_classes)
print('Accuracy: %f' % accuracy)
# precision tp / (tp + fp)
precision = precision_score(y_label, yhat_classes)
print('Precision: %f' % precision)
recall = recall_score(y_label, yhat_classes)
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(y_label, yhat_classes)
print('F1 score: %f' % f1)

Accuracy: 0.848485
Precision: 0.600000
Recall: 0.857143
F1 score: 0.705882


In [399]:
#comparison list of the result

df=pd.DataFrame(data.iloc[x_test_all.index.values,:][['index','text']]).join(pd.DataFrame(data_Processed.iloc[x_test_all.index.values,:][['OpenHour']]))
df['label']=y_test_all
df['predict']=yhat_classes
df['predictvalue']=yhat_probs
df

Unnamed: 0,index,text,OpenHour,label,predict,predictvalue
136,2019-04-01 03:20:00,Art Institute of Chicago will be hosting Gregg...,0,0,0,0.002574
76,2019-03-14 23:13:00,"Perpetual Art Machine, de Anna Bella Geiger. V...",0,0,1,0.128573
174,2019-04-10 09:11:00,Original Bachor added to the collection of the...,0,0,0,0.45327
101,2019-04-02 14:43:00,"Leão de Veneza, de Mark Rothko. Escultura, Art...",1,0,0,0.046376
139,2019-03-19 04:03:00,The Art Institute of Chicago is hosting Every...,0,0,0,0.027524
198,2019-02-08 11:27:00,I'm at The Art Institute of Chicago - @ artins...,1,1,0,0.60308
59,2019-03-12 15:38:00,Art Institute was amazing! # rembrandt # beaut...,1,1,1,0.168298
96,2019-03-30 16:43:00,"Cut Piece, de Djanira. Performance, Art Instit...",1,0,1,0.633092
23,2019-03-27 10:48:00,Hopper @The Art Institute of Chicago https://w...,1,1,1,0.500968
30,2019-03-28 13:38:00,I'm at The Art Institute of Chicago - @ artins...,1,1,1,0.730363


In [408]:
from sklearn import naive_bayes
Naive = naive_bayes.MultinomialNB()
Naive.fit(x_train_all,list(y_train_all))
yhat_classes= Naive.predict(x_test_all)
y_label=y_test_all.astype('int32') 

ValueError: Input X must be non-negative