## Import drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## import libraries

In [None]:
import pandas as pd
import numpy as np
import nltk
import gensim
import re
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix,precision_score,recall_score,f1_score
from sklearn.linear_model import LogisticRegression

from sklearn import svm
from sklearn.naive_bayes import MultinomialNB

# **MODULE 1**
### Subjectivity-objectivity classification

In [None]:
path_dataset='/content/drive/MyDrive/BTP/dataset.csv'
data = pd.read_csv(path_dataset)
data.head()

Unnamed: 0.1,Unnamed: 0,text,label
0,0,"smart and alert , thirteen conversations about...",1
1,1,"color , musical bounce and warm seas lapping o...",1
2,2,it is not a mass-market entertainment but an u...,1
3,3,a light-hearted french film about the spiritua...,1
4,4,my wife is an actress has its moments in looki...,1


In [None]:
data=data.drop('Unnamed: 0',axis=1)
data.head()

Unnamed: 0,text,label
0,"smart and alert , thirteen conversations about...",1
1,"color , musical bounce and warm seas lapping o...",1
2,it is not a mass-market entertainment but an u...,1
3,a light-hearted french film about the spiritua...,1
4,my wife is an actress has its moments in looki...,1


In [None]:
# Making all letters lowercase
data['text'] = data['text'].apply(lambda x: " ".join(x.lower() for x in x.split()))
data.head()

Unnamed: 0,text,label
0,"smart and alert , thirteen conversations about...",1
1,"color , musical bounce and warm seas lapping o...",1
2,it is not a mass-market entertainment but an u...,1
3,a light-hearted french film about the spiritua...,1
4,my wife is an actress has its moments in looki...,1


In [None]:
data['text']

0       smart and alert , thirteen conversations about...
1       color , musical bounce and warm seas lapping o...
2       it is not a mass-market entertainment but an u...
3       a light-hearted french film about the spiritua...
4       my wife is an actress has its moments in looki...
                              ...                        
9995    in the end , they discover that balance in lif...
9996    a counterfeit 1000 tomin bank note is passed i...
9997    enter the beautiful and mysterious secret agen...
9998    after listening to a missionary from china spe...
9999    looking for a short cut to fame , glass concoc...
Name: text, Length: 10000, dtype: object

In [None]:
# Removing Punctuation, Symbols
data['text'] = data['text'].str.replace('[^\w\s]',' ')
data.head()

Unnamed: 0,text,label
0,smart and alert thirteen conversations about...,1
1,color musical bounce and warm seas lapping o...,1
2,it is not a mass market entertainment but an u...,1
3,a light hearted french film about the spiritua...,1
4,my wife is an actress has its moments in looki...,1


In [None]:
data['text']

0       smart and alert   thirteen conversations about...
1       color   musical bounce and warm seas lapping o...
2       it is not a mass market entertainment but an u...
3       a light hearted french film about the spiritua...
4       my wife is an actress has its moments in looki...
                              ...                        
9995    in the end   they discover that balance in lif...
9996    a counterfeit 1000 tomin bank note is passed i...
9997    enter the beautiful and mysterious secret agen...
9998    after listening to a missionary from china spe...
9999    looking for a short cut to fame   glass concoc...
Name: text, Length: 10000, dtype: object

In [None]:
data['text'] = data['text'].apply(lambda x: x.split())  #tokenize
data['text'].head()

0    [smart, and, alert, thirteen, conversations, a...
1    [color, musical, bounce, and, warm, seas, lapp...
2    [it, is, not, a, mass, market, entertainment, ...
3    [a, light, hearted, french, film, about, the, ...
4    [my, wife, is, an, actress, has, its, moments,...
Name: text, dtype: object

In [None]:
data2=data.copy()
for i in range(len(data['text'])):
    data2['text'][i] = ','.join(data['text'][i])    
data2['text'].head(20)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


0     smart,and,alert,thirteen,conversations,about,o...
1     color,musical,bounce,and,warm,seas,lapping,on,...
2     it,is,not,a,mass,market,entertainment,but,an,u...
3     a,light,hearted,french,film,about,the,spiritua...
4     my,wife,is,an,actress,has,its,moments,in,looki...
5     works,both,as,an,engaging,drama,and,an,incisiv...
6     even,a,hardened,voyeur,would,require,the,patie...
7     when,perry,fists,a,bull,at,the,moore,farm,it,s...
8     the,characters,are,paper,thin,and,their,person...
9     the,script,is,a,tired,one,with,few,moments,of,...
10    the,bland,outweighs,the,nifty,and,cletis,tout,...
11    directed,by,david,twohy,with,the,same,great,ey...
12    it,s,a,very,tasteful,rock,and,roll,movie,you,c...
13    provides,the,kind,of,laugh,therapy,i,need,from...
14    worth,a,look,by,those,on,both,sides,of,the,iss...
15    watching,the,film,is,like,reading,a,times,port...
16    despite,these,annoyances,the,capable,clayburgh...
17    it,s,a,good,thing,that,woolly,mammoths,are

In [None]:
X_train, X_val, y_train, y_val = train_test_split(data2.text.values, 
                                                  data2.label,test_size=0.2,random_state=0) 

#  TF-IDF vectorizer

In [None]:
tfidf = TfidfVectorizer(analyzer='word',stop_words='english')
tfidf.fit(data2['text'])
X_train_tfidf = tfidf.transform(X_train)
X_val_tfidf = tfidf.transform(X_val) 
print(X_train_tfidf.get_shape())
print(X_val_tfidf.get_shape())

(8000, 20588)
(2000, 20588)


### Model 1: logistic regression


In [None]:
logreg = LogisticRegression()
logreg.fit(X_train_tfidf, y_train)
y_pred = logreg.predict(X_val_tfidf)
print('logistic regression tfidf accuracy %s' % accuracy_score(y_val,y_pred,normalize=bool))

logistic regression tfidf accuracy 0.8785


In [None]:
print(confusion_matrix(y_val,y_pred))

[[912 100]
 [143 845]]


In [None]:
print("precision_score:",precision_score(y_val,y_pred))
print("recall_score:",recall_score(y_val,y_pred))
print("\nf1 score:",f1_score(y_val,y_pred))


precision_score: 0.8941798941798942
recall_score: 0.8552631578947368

f1 score: 0.8742886704604242


### Model 2: Linear SVM


In [None]:
from sklearn import svm
svm_=svm.SVC()
svm_.fit(X_train_tfidf, y_train)
y_pred = svm_.predict(X_val_tfidf)
print('svm using tfidf accuracy %s' % accuracy_score(y_val,y_pred))

svm using tfidf accuracy 0.888


In [None]:
print(confusion_matrix(y_val,y_pred))

[[913  99]
 [125 863]]


In [None]:
print("precision_score:",precision_score(y_val,y_pred))
print("recall_score:",recall_score(y_val,y_pred))
print("\nf1 score:",f1_score(y_val,y_pred))


precision_score: 0.8970893970893971
recall_score: 0.8734817813765182

f1 score: 0.8851282051282051


### Model 3: Multinomial Naive Bayes Classifier


In [None]:
nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train)
y_pred = nb.predict(X_val_tfidf)
print('naive bayes tfidf accuracy %s' % accuracy_score( y_val,y_pred))


naive bayes tfidf accuracy 0.897


In [None]:
print(confusion_matrix(y_val,y_pred))

[[910 102]
 [104 884]]


In [None]:
print("precision_score:",precision_score(y_val,y_pred))
print("recall_score:",recall_score(y_val,y_pred))
print("\nf1 score:",f1_score(y_val,y_pred))


precision_score: 0.896551724137931
recall_score: 0.8947368421052632

f1 score: 0.89564336372847


#  Count Vectors

In [None]:
count_vect = CountVectorizer(analyzer='word',stop_words='english')
count_vect.fit(data2['text'])
X_train_count =  count_vect.transform(X_train)
X_val_count =  count_vect.transform(X_val)
print(X_train_count.get_shape())
print(X_val_count.get_shape())
print(X_train[0])


(8000, 20588)
(2000, 20588)
nha,fala,my,voice,a,musical,comedy,is,the,story,of,vita,a,young,african,woman,who,must,never,sing


### Model 1: Logistic Regression

In [None]:
logreg = LogisticRegression()
logreg.fit(X_train_count, y_train)
y_pred = logreg.predict(X_val_count)
print('log reg count vectors accuracy %s' % accuracy_score(y_val,y_pred))


log reg count vectors accuracy 0.8815


In [None]:
print(confusion_matrix(y_val,y_pred))

[[908 104]
 [133 855]]


In [None]:
print("precision_score:",precision_score(y_val,y_pred))
print("recall_score:",recall_score(y_val,y_pred))
print("\nf1 score:",f1_score(y_val,y_pred))


precision_score: 0.8915537017726799
recall_score: 0.8653846153846154

f1 score: 0.8782742681047765


### Model 2: LinearSVM

In [None]:
from sklearn import svm
svm_count=svm.SVC()
svm_count.fit(X_train_count, y_train)
y_pred = svm_count.predict(X_val_count)
print('svm using tfidf accuracy %s' % accuracy_score(y_val,y_pred))

svm using tfidf accuracy 0.8695


In [None]:
print(confusion_matrix(y_val,y_pred))

[[905 107]
 [154 834]]


In [None]:
print("precision_score:",precision_score(y_val,y_pred))
print("recall_score:",recall_score(y_val,y_pred))
print("\nf1 score:",f1_score(y_val,y_pred))


precision_score: 0.8862911795961743
recall_score: 0.8441295546558705

f1 score: 0.864696734059098


### Model 3: Multinomial Naive Bayes Classifier


In [None]:
nb = MultinomialNB()
nb.fit(X_train_count, y_train)
y_pred = nb.predict(X_val_count)
print('naive bayes count vectors accuracy %s' % accuracy_score( y_val,y_pred))


naive bayes count vectors accuracy 0.901


In [None]:
print(confusion_matrix(y_val,y_pred))

[[906 106]
 [ 92 896]]


In [None]:
print("precision_score:",precision_score(y_val,y_pred))
print("recall_score:",recall_score(y_val,y_pred))
print("\nf1 score:",f1_score(y_val,y_pred))


precision_score: 0.8942115768463074
recall_score: 0.9068825910931174

f1 score: 0.9005025125628141


#Word2vec Embeddings 
### " GoogleNews-vectors-negative300 "

In [None]:
from gensim.models import Word2Vec
path_w2v='/content/drive/MyDrive/BTP/GoogleNews-vectors-negative300.bin'
model_w2v = gensim.models.KeyedVectors.load_word2vec_format(path_w2v, binary=True)

In [None]:
pre-trained word2vec model testing
print(model_w2v.most_similar("usa"))

In [None]:
def word_vector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0
    for word in tokens:
        try:
            vec += model_w2v[word].reshape((1, size))
            count += 1.
        except KeyError:
            continue
    if count != 0: 
        vec /= count
    return vec

wordvec_arrays = np.zeros((len(data['text']), 300)) 
for i in range(len(data['text'])):
    wordvec_arrays[i,:] = word_vector(data['text'][i], 300)
wordvec_df = pd.DataFrame(wordvec_arrays)

# path='/content/sub_obj_w2v_predicted_numpy_df.csv'
# wordvec_df=pd.read_csv(path)
# wordvec_df=wordvec_df.drop('Unnamed: 0',axis=1)


wordvec_df.head() 

In [None]:
wordvec_df.shape

In [None]:
#wordvec_df.to_csv('sub_obj_w2v_predicted_numpy_df.csv')

In [None]:
X_train, X_val, y_train, y_val = train_test_split(wordvec_df, data.label,test_size=0.2)

### Model 1: Logistic Regression

In [None]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train) 
y_pred = logreg.predict(X_val)
print('log reg word2vec %s' % accuracy_score( y_val,y_pred))
print("\nconfusion matrix:\n",confusion_matrix(y_val,y_pred))
print("precision_score:",precision_score(y_val,y_pred))
print("recall_score:",recall_score(y_val,y_pred))
print("\nf1 score:",f1_score(y_val,y_pred))

### Model 2: Linear SVM

In [None]:
from sklearn.svm import LinearSVC
clf=LinearSVC()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_val)
print('lsvm using word2vec %s' % accuracy_score(y_pred, y_val))
print("\nconfusion matrix:\n",confusion_matrix(y_val,y_pred))
print("precision_score:",precision_score(y_val,y_pred))
print("recall_score:",recall_score(y_val,y_pred))
print("\nf1 score:",f1_score(y_val,y_pred))

In [None]:
nb = MultinomialNB()
nb.fit(X_train_count, y_train)
y_pred = nb.predict(X_val_count)
print('naive bayes count vectors accuracy %s' % accuracy_score( y_val,y_pred))
print("\nconfusion matrix:\n",confusion_matrix(y_val,y_pred))
print("precision_score:",precision_score(y_val,y_pred))
print("recall_score:",recall_score(y_val,y_pred))
print("\nf1 score:",f1_score(y_val,y_pred))




---



---



---



#Neural Network(Keras)

In [None]:
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
encoded_sentences = [one_hot(w,300) for w in data2['text']]
length = 300
padded_sent = pad_sequences(encoded_sentences, maxlen = length, padding = 'pre')
# Splitting into training and testing data in 90:10 ratio
X_train, X_val, y_train, y_val = train_test_split(padded_sent, data.label,
                                             random_state=42, test_size=0.3, 
                                             )
#defining the model 
mymodel = Sequential()
mymodel.add(Embedding(300,100, input_length= length))
mymodel.add(Flatten())
mymodel.add(Dense(1,activation='sigmoid')) 
mymodel.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
#fitting the model
mymodel.fit(X_train,y_train,batch_size = 32,epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f937c46e990>



---



---



---



# **MODULE 2**
### Positive-Negative classification

In [None]:
path_dataset_pos_neg='/content/drive/My Drive/BTP/data_pos_neg.csv'
data_pos_neg = pd.read_csv(path_dataset_pos_neg)
data_pos_neg.head()

Unnamed: 0.1,Unnamed: 0,text,lable
0,0,"Based on an actual story, John Boorman shows t...",1
1,1,This is a gem. As a Film Four production - the...,1
2,2,"I really like this show. It has drama, romance...",1
3,3,This is the best 3-D experience Disney has at ...,1
4,4,"Of the Korean movies I've seen, only three had...",1


In [None]:
data_pos_neg.shape

(50000, 3)

In [None]:
data_pos_neg=data_pos_neg.drop('Unnamed: 0',axis=1)
data_pos_neg.head()

Unnamed: 0,text,lable
0,"Based on an actual story, John Boorman shows t...",1
1,This is a gem. As a Film Four production - the...,1
2,"I really like this show. It has drama, romance...",1
3,This is the best 3-D experience Disney has at ...,1
4,"Of the Korean movies I've seen, only three had...",1


In [None]:
data_pos_neg.shape

(50000, 2)

In [None]:
# Making all letters lowercase
data_pos_neg['text'] = data_pos_neg['text'].apply(lambda x: " ".join(x.lower() for x in x.split()))
data_pos_neg.head()

Unnamed: 0,text,lable
0,"based on an actual story, john boorman shows t...",1
1,this is a gem. as a film four production - the...,1
2,"i really like this show. it has drama, romance...",1
3,this is the best 3-d experience disney has at ...,1
4,"of the korean movies i've seen, only three had...",1


In [None]:
# Removing Punctuation, Symbols
data_pos_neg['text'] = data_pos_neg['text'].str.replace('[^\w\s]','')
data_pos_neg.head()

Unnamed: 0,text,lable
0,based on an actual story john boorman shows th...,1
1,this is a gem as a film four production the a...,1
2,i really like this show it has drama romance a...,1
3,this is the best 3d experience disney has at t...,1
4,of the korean movies ive seen only three had r...,1


In [None]:
data_pos_neg.to_csv("puncuation_removed_pos_neg_dataset.csv")

In [None]:
# #Removing Stop Words using NLTK
# from nltk.corpus import stopwords
# nltk.download('stopwords')
# stop = stopwords.words('english')
# data_pos_neg['text'] = data_pos_neg['text'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
# data_pos_neg.head()

In [None]:
data_pos_neg['text'] = data_pos_neg['text'].apply(lambda x: x.split())  #tokenize
data_pos_neg['text'].head()

0    [based, on, an, actual, story, john, boorman, ...
1    [this, is, a, gem, as, a, film, four, producti...
2    [i, really, like, this, show, it, has, drama, ...
3    [this, is, the, best, 3d, experience, disney, ...
4    [of, the, korean, movies, ive, seen, only, thr...
Name: text, dtype: object

In [None]:
data2=data_pos_neg.copy()
for i in range(len(data_pos_neg['text'])):
    data2['text'][i] = ','.join(data_pos_neg['text'][i])    
data2['text'].head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


0    based,on,an,actual,story,john,boorman,shows,th...
1    this,is,a,gem,as,a,film,four,production,the,an...
2    i,really,like,this,show,it,has,drama,romance,a...
3    this,is,the,best,3d,experience,disney,has,at,t...
4    of,the,korean,movies,ive,seen,only,three,had,r...
Name: text, dtype: object

In [None]:
data2.head()

Unnamed: 0,text,lable
0,"based,on,an,actual,story,john,boorman,shows,th...",1
1,"this,is,a,gem,as,a,film,four,production,the,an...",1
2,"i,really,like,this,show,it,has,drama,romance,a...",1
3,"this,is,the,best,3d,experience,disney,has,at,t...",1
4,"of,the,korean,movies,ive,seen,only,three,had,r...",1


In [None]:
data2.tail()

Unnamed: 0,text,lable
49995,"about,a,year,ago,i,finally,gave,up,on,american...",1
49996,"when,i,saw,the,elaborate,dvd,box,for,this,and,...",1
49997,"last,november,i,had,a,chance,to,see,this,film,...",1
49998,"great,movie,i,loved,it,great,editing,and,use,o...",1
49999,"enchanted,april,is,a,tone,poem,an,impressionis...",1


In [None]:
X_train, X_val, y_train, y_val = train_test_split(data2.text.values
                                 ,data2.lable,test_size=0.3,random_state=0) 

# TF-IDF vectorizer

In [None]:
tfidf = TfidfVectorizer()
tfidf.fit(data2['text'])
X_train_tfidf = tfidf.transform(X_train)
X_val_tfidf = tfidf.transform(X_val) 
print(X_train_tfidf.get_shape())
print(X_val_tfidf.get_shape())

(35000, 181018)
(15000, 181018)


### Model 1: logistic regression


In [None]:
logreg = LogisticRegression()
logreg.fit(X_train_tfidf, y_train)
y_pred = logreg.predict(X_val_tfidf)
print('logistic regression tfidf accuracy %s' % accuracy_score(y_val,y_pred,normalize=bool));

logistic regression tfidf accuracy 0.8950666666666667


In [None]:
print(confusion_matrix(y_val,y_pred)) 

[[6623  838]
 [ 736 6803]]


In [None]:
print("precision_score:",precision_score(y_val,y_pred))
print("recall_score:",recall_score(y_val,y_pred))
print("f1_score:",f1_score(y_val,y_pred))



precision_score: 0.8903284910352048
recall_score: 0.9023743202016182
f1_score: 0.8963109354413702


### Model 2: Linear SVM



In [None]:
from sklearn import svm
svm_=svm.LinearSVC()
svm_.fit(X_train_tfidf, y_train)
y_pred = svm_.predict(X_val_tfidf)
print('svm using tfidf accuracy %s' % accuracy_score(y_val,y_pred))

svm using tfidf accuracy 0.9023333333333333


In [None]:
print("precision_score:",precision_score(y_val,y_pred))
print("recall_score:",recall_score(y_val,y_pred))
print("f1_score:",f1_score(y_val,y_pred))

precision_score: 0.8985564304461943
recall_score: 0.9082106380156519
f1_score: 0.9033577412758098


### Model 3: Multinomial Naive Bayes Classifier


In [None]:
nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train)
y_pred = nb.predict(X_val_tfidf)
print('naive bayes tfidf accuracy %s' % accuracy_score( y_val,y_pred))


naive bayes tfidf accuracy 0.8638


In [None]:
print("precision_score:",precision_score(y_val,y_pred))
print("recall_score:",recall_score(y_val,y_pred))
print("f1_score:",f1_score(y_val,y_pred))

precision_score: 0.8912300683371298
recall_score: 0.8303488526329752
f1_score: 0.8597129712284556


#  Count Vectors

In [None]:
count_vect = CountVectorizer(analyzer='word',stop_words='english')
count_vect.fit(data2['text'])
X_train_count =  count_vect.transform(X_train)
X_val_count =  count_vect.transform(X_val)
print(X_train_count.get_shape())
print(X_val_count.get_shape())
print(X_train[0])


(35000, 180705)
(15000, 180705)
this,movie,was,produced,by,the,biggest,producer,in,costa,rica,although,their,authors,brag,about,it,as,a,the,biggest,movie,in,costa,rica,ever,made,and,their,actors,even,dare,to,say,that,they,didnt,get,an,oscar,nomination,due,to,its,political,relevance,with,oil,he,he,right,well,this,is,all,a,lie,this,movie,was,supposedly,based,on,a,book,written,by,carlos,salazar,herrera,about,a,love,triangle,super,cliché,soap,operalike,subject,and,it,secondary,story,is,about,an,oil,problem,on,the,atlantic,coast,of,costa,rica,with,an,american,oil,company,who,wanted,to,explore,oil,deposits,on,the,region,but,at,the,end,it,never,was,approved,by,the,government,now,it,may,have,some,nice,footage,about,the,beaches,but,thats,it,nothing,bigger,than,that,it,is,all,in,the,camera,most,of,the,actors,are,lousy,except,for,two,or,three,the,rest,lack,of,sense,of,what,acting,means,they,overact,most,of,the,time,the,story,is,completely,common,and,cliché,worthy,of,a,cheap,mexican,soap,opera,as,

### Model 1: logistic regression


In [None]:
logreg = LogisticRegression()
logreg.fit(X_train_count, y_train)
y_pred = logreg.predict(X_val_count)
print('log reg count vectors accuracy %s' % accuracy_score(y_val,y_pred))


log reg count vectors accuracy 0.8874


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [None]:
print("precision_score:",precision_score(y_val,y_pred))
print("recall_score:",recall_score(y_val,y_pred))
print("f1_score:",f1_score(y_val,y_pred))

precision_score: 0.8864957716701902
recall_score: 0.8899058230534553
f1_score: 0.8881975243264711


### Model 2: LinearSVM

In [None]:
svm_count=svm.LinearSVC()
svm_count.fit(X_train_count, y_train)
y_pred = svm_count.predict(X_val_count)
print('svm using tfidf accuracy %s' % accuracy_score(y_val,y_pred))

svm using tfidf accuracy 0.8704666666666667




In [None]:
print("precision_score:",precision_score(y_val,y_pred))
print("recall_score:",recall_score(y_val,y_pred))
print("f1_score:",f1_score(y_val,y_pred))

precision_score: 0.8713830634457128
recall_score: 0.8708051465711633
f1_score: 0.8710940091554435


### Model 3: Multinomial Naive Bayes Classifier


In [None]:
nb = MultinomialNB()
nb.fit(X_train_count, y_train)
y_pred = nb.predict(X_val_count)
print('naive bayes count vectors accuracy %s' % accuracy_score( y_val,y_pred))


naive bayes count vectors accuracy 0.8552


In [None]:
print("precision_score:",precision_score(y_val,y_pred))
print("recall_score:",recall_score(y_val,y_pred))
print("f1_score:",f1_score(y_val,y_pred))

precision_score: 0.8717273860645519
recall_score: 0.8347260909935005
f1_score: 0.8528255861227809


#Word2vec Embeddings 
### " GoogleNews-vectors-negative300 "

In [None]:
def word_vector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0
    for word in tokens:
        try:
            vec += model_w2v[word].reshape((1, size))
            count += 1.
        except KeyError:
            continue
    if count != 0: 
        vec /= count
    return vec

wordvec_arrays = np.zeros((len(data_pos_neg['text']), 300)) 
for i in range(len(data_pos_neg['text'])):
    wordvec_arrays[i,:] = word_vector(data_pos_neg['text'][i], 300)
wordvec_df = pd.DataFrame(wordvec_arrays)

# #wordvec_df.to_csv('pos_neg_w2v_predicted_numpy_df.csv')
# path='/content/pos_neg_w2v_predicted_numpy_df.csv'
# wordvec_df=pd.read_csv(path)
# wordvec_df=wordvec_df.drop('Unnamed: 0',axis=1)

wordvec_df.head() 

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299
0,0.057534,0.052148,0.016921,0.054128,-0.014263,-0.022236,0.017315,-0.10311,0.03927,0.062262,-0.009213,-0.148776,-0.0284,0.014905,-0.086781,0.044017,0.053748,0.110088,-0.010585,-0.03901,-0.002652,0.046148,0.022057,-0.026244,0.028429,-0.025179,-0.070341,0.064091,0.05411,-0.024177,0.021714,-0.015587,-0.068122,-0.004571,-0.011416,-0.017683,0.014111,-0.002447,0.009307,0.05162,...,-0.014681,0.003226,-0.051261,0.055903,0.054167,0.132023,2.7e-05,0.006499,-0.042062,-0.00088,0.05455,0.067553,0.080072,0.037671,0.075043,-0.059268,-0.032497,-0.092286,-0.026034,-0.014472,0.04235,-0.038308,-0.007455,0.067884,0.017166,0.009906,-0.107678,-0.013344,-0.008678,0.067193,-0.091669,0.016437,-0.117543,0.020934,-0.04465,0.007014,0.003082,-0.040989,0.056434,-0.030823
1,0.06159,0.044422,0.029733,0.089596,-0.041061,-0.016051,0.055132,-0.078687,0.088635,0.062757,-0.030776,-0.128093,-0.009003,0.031601,-0.112643,0.068739,0.052981,0.066818,0.013643,-0.064724,-0.020455,0.0572,-6.6e-05,-0.005388,0.061072,-0.017949,-0.051453,0.048008,0.049268,0.01072,-0.031948,0.016688,-0.019118,0.007445,0.024383,-0.006881,0.029107,-0.015346,0.044988,0.062435,...,-0.036019,0.007171,-0.032719,0.030005,0.043627,0.143027,0.004686,-0.009668,-0.06313,-0.003305,0.062128,0.09735,0.091865,0.03272,0.072095,-0.06306,-0.048402,-0.075683,-0.031958,-0.013191,0.014303,-0.066969,0.00963,0.049943,0.021414,0.004902,-0.085624,-0.014087,-0.007574,0.038036,-0.091333,0.011221,-0.085027,0.01731,-0.050747,-0.020466,0.013848,-0.049637,0.039857,-0.017853
2,0.004385,0.007163,0.023529,0.123482,-0.055625,0.004787,0.038118,-0.083174,0.070619,0.077726,-0.035618,-0.125515,-0.038758,-0.012552,-0.095129,0.068464,0.058758,0.105272,0.02783,-0.050708,-0.067062,0.02425,0.057701,-0.020073,0.024755,-0.004671,-0.094764,0.037403,0.034137,0.010946,-0.029288,0.02401,-0.048219,-0.014171,0.011199,0.015607,0.011373,-0.012639,0.005331,0.061892,...,-0.014292,-0.02506,-0.016557,0.033752,0.004957,0.144354,-0.012341,-0.008695,-0.044218,0.008171,0.056812,0.067532,0.122696,0.05389,0.059113,-0.057497,-0.06295,-0.120161,-0.045356,-0.012723,0.023112,0.007869,0.009655,0.058937,0.032826,-0.010206,-0.058334,-0.066434,0.000789,0.05411,-0.045683,0.05991,-0.083039,0.005264,-0.027817,-0.022943,-0.00803,-0.033065,0.031746,-0.044368
3,0.053974,0.013859,0.01218,0.101552,-0.034833,0.005551,0.014865,-0.087893,0.063488,0.093657,-0.038968,-0.113278,-0.023742,0.029507,-0.069268,0.073352,0.03932,0.09698,-0.014636,-0.043431,-0.045278,0.079044,0.000469,-0.019651,0.027498,-0.026251,-0.07998,0.057272,0.029137,-0.009249,-0.041595,0.030834,-0.064425,0.016007,0.033028,-0.024028,0.006477,0.010928,0.063325,0.078209,...,-0.034513,0.009815,-0.011189,0.04724,0.057694,0.131466,-0.012587,-0.041113,-0.048135,-0.024752,0.018776,0.084071,0.056571,0.030219,0.084574,-0.065687,-0.072788,-0.06853,-0.04584,-0.021196,-0.008889,0.008126,-0.012153,0.067473,0.032363,0.026041,-0.081856,-0.042581,0.002956,0.055275,-0.132166,0.015249,-0.096143,0.032345,-0.038638,0.001738,0.000473,-0.047921,0.033484,-0.033453
4,0.062477,0.032065,0.019158,0.111853,-0.061255,-0.004315,0.020237,-0.063219,0.069133,0.077688,-0.049687,-0.124248,-0.024101,0.050642,-0.095223,0.071218,0.042091,0.099225,-0.020163,-0.058511,-0.001711,0.027954,0.040872,-0.011299,0.046446,-0.041658,-0.086516,0.062818,0.045324,0.005419,-0.035115,0.011676,-0.035521,0.02643,0.029617,-0.032784,0.0477,-0.000677,0.046638,0.066043,...,-0.041413,0.006946,-0.040575,0.04878,0.034779,0.132343,-0.010118,-0.01885,-0.038659,-0.009305,0.035043,0.074222,0.075527,0.041361,0.05647,-0.03664,-0.055809,-0.072536,-0.042823,-0.004977,0.021633,-0.032664,0.034512,0.060427,0.045073,0.031814,-0.065223,-0.03739,-0.007063,0.071795,-0.069386,0.025435,-0.089948,0.018675,-0.056864,-0.012915,0.003331,-0.046286,0.028899,0.009456


In [None]:
wordvec_df.shape

(50000, 300)

In [None]:
X_train, X_val, y_train, y_val = train_test_split(wordvec_df, data2.lable,test_size=0.3)

### Model 1: logistic regression


In [None]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_val)
print('logistic regression tfidf accuracy %s' % accuracy_score(y_val,y_pred,normalize=bool))
print("precision_score:",precision_score(y_val,y_pred))
print("recall_score:",recall_score(y_val,y_pred))
print("f1_score:",f1_score(y_val,y_pred))

logistic regression tfidf accuracy 0.855
precision_score: 0.8572581730122427
recall_score: 0.8511888859203847
f1_score: 0.8542127488437563


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### Model 2: Linear SVM



In [None]:

clf=LinearSVC()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_val)
print('lsvm using word2vec %s' % accuracy_score(y_val, y_pred))
print("precision_score:",precision_score(y_val,y_pred))
print("recall_score:",recall_score(y_val,y_pred))
print("f1_score:",f1_score(y_val,y_pred))

lsvm using word2vec 0.8621333333333333
precision_score: 0.8606230031948882
recall_score: 0.8636120758749666
f1_score: 0.8621149486598212


### Model 3: Multinomial Naive Bayes Classifier


In [None]:
nb = MultinomialNB()
nb.fit(X_train_count, y_train)
y_pred = nb.predict(X_val_count)
print('naive bayes count vectors accuracy %s' % accuracy_score( y_val,y_pred))
print("precision_score:",precision_score(y_val,y_pred))
print("recall_score:",recall_score(y_val,y_pred))
print("f1_score:",f1_score(y_val,y_pred))


naive bayes count vectors accuracy 0.49893333333333334
precision_score: 0.49798549556809024
recall_score: 0.49532460593107136
f1_score: 0.4966514867398875
