In [1]:
import pandas as pd
import numpy as np
import scipy
import re
import csv
import operator
from sklearn.metrics import confusion_matrix
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression # load the library


In [2]:
df = pd.read_csv("sentiment.csv",encoding='ISO-8859-1') 

In [3]:
df.head(3)

Unnamed: 0,sentiment,text
0,Positive,RT @ScottWalker: Didn't catch the full #GOPdeb...
1,Positive,RT @RobGeorge: That Carly Fiorina is trending ...
2,Positive,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...


In [4]:
df.columns

Index(['sentiment', 'text'], dtype='object')

In [5]:
df.shape

(10729, 2)

In [6]:
df['sentiment'].unique()

array(['Positive', 'Negative'], dtype=object)

In [7]:
df['sentiment'].value_counts()

Negative    8493
Positive    2236
Name: sentiment, dtype: int64

# Convert labels into a machine readable format

In [8]:
from sklearn import preprocessing


In [9]:
train_class_y = ['Negative','Positive']

In [10]:
le = preprocessing.LabelEncoder()

In [11]:
le.fit(train_class_y)

LabelEncoder()

In [12]:
train_y_ = le.transform(df['sentiment'])

In [13]:
train_y_

array([1, 1, 1, ..., 1, 0, 1])

In [14]:
le.transform(['Positive','Negative','Positive']) 

array([1, 0, 1])

# Functions to clean the text  

In [15]:
# clean the text
def CleanText(raw_comment):
    # 1. lower case
    new_comment = raw_comment.lower()
    # 2. remove punctuation
    new_comment = re.sub(r"[^\w\s]", "", new_comment)
    
      #add something new
    #new_comment = new_comment.replace('\n','').strip()
    #new_comment = new_comment.replace(u'\u2018',"'").replace(u'\u2019',"'") 
    #new_comment = new_comment.replace('n\'t',' not')
    #new_comment = new_comment.replace('RT','')
    #new_comment = re.sub(r"^.*http.*$", '', new_comment)
    #new_comment = re.sub(r'[^\x00-\x7F]+','', new_comment)
    #new_comment = new_comment.replace('gop','')
    #new_comment = new_comment.replace('debate','')
    #new_comment = new_comment.replace('gopdeb','')
    #new_comment = new_comment.replace('gopdebate','')
    #new_comment = new_comment.replace('gopdebates','')
    #new_comment = new_comment.replace('fox','')
    #new_comment = new_comment.replace('news','')
    #new_comment = new_comment.replace('foxnew','')
    #new_comment = new_comment.replace('foxnes','')
    #new_comment = new_comment.replace('amp','')
    
    
    return new_comment

#Remove stop words
stop_words = set(stopwords.words('english'))
re_stop_words = re.compile(r"\b(" + "|".join(stop_words) + ")\\W", re.I)
def removeStopWords(sentence):
    global re_stop_words
    return re_stop_words.sub(" ", sentence)

#Stemming
stemmer = SnowballStemmer("english")
def stemming(sentence):
    stemSentence = ""
    for word in sentence.split():
        stem = stemmer.stem(word)
        stemSentence += stem
        stemSentence += " "
    stemSentence = stemSentence.strip()
    return stemSentence

In [16]:
#df['text'] = df['text'].apply(CleanText)
#df['text'] = df['text'].apply(removeStopWords)
#df['text'] = df['text'].apply(stemming)

In [17]:
df.head(3)

Unnamed: 0,sentiment,text
0,Positive,RT @ScottWalker: Didn't catch the full #GOPdeb...
1,Positive,RT @RobGeorge: That Carly Fiorina is trending ...
2,Positive,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...


# Split train and test

In [18]:
train, test = train_test_split(df, test_size = .10, random_state=7)
#xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size = 0.2, random_state = 0)
train_y = le.transform(train['sentiment'])
test.shape

(1073, 2)

In [19]:
train['text'] = train['text'].apply(CleanText)
train['text'] = train['text'].apply(removeStopWords)
train['text'] = train['text'].apply(stemming)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [20]:
train_x = train['text']


# Tokenizing using bag of words - UNI 

In [21]:
count_vect = CountVectorizer()

In [22]:
X_train_counts = count_vect.fit_transform(train_x)

In [23]:
X_train_counts.shape

(9656, 12533)

In [24]:
type(X_train_counts)

scipy.sparse.csr.csr_matrix

In [25]:
X_train_counts.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [26]:
type(X_train_counts)

scipy.sparse.csr.csr_matrix

In [27]:
np.count_nonzero(X_train_counts.toarray())

107171

In [28]:
count_vect.get_feature_names()

['07',
 '0iiiiiii0_girl',
 '10',
 '100',
 '10000',
 '100000',
 '10000x',
 '1000s',
 '100kyr',
 '100s',
 '1015',
 '106',
 '10k',
 '11',
 '1100',
 '11000',
 '1111',
 '116',
 '11th',
 '12',
 '1216bj',
 '13',
 '130',
 '13m',
 '14',
 '140charact',
 '143',
 '14th',
 '15',
 '1525',
 '15min',
 '15yo',
 '16',
 '160',
 '16m',
 '17',
 '170',
 '176b',
 '18',
 '18000',
 '1800s',
 '1828',
 '187k',
 '189',
 '19',
 '1954',
 '1955',
 '1960s',
 '1965',
 '1980',
 '1980s',
 '1996',
 '1997',
 '19b',
 '19h19',
 '19million',
 '1aâstopiran',
 '1brian',
 '1catherinesiena',
 '1hr',
 '1m',
 '1marchella',
 '1marcorubio',
 '1on1',
 '1st',
 '1stplace',
 '1what',
 '20',
 '2000',
 '200000',
 '2000s',
 '2001',
 '20012009',
 '2003',
 '2004',
 '2007',
 '2008',
 '2009',
 '2009â',
 '2010',
 '201112',
 '2012',
 '2013',
 '2013_tiffani',
 '2015',
 '2016',
 '2016elect',
 '206',
 '20th',
 '20x',
 '21',
 '213',
 '215000',
 '21million',
 '21st',
 '21th',
 '22',
 '229b',
 '22aday',
 '22nd',
 '23',
 '23115',
 '24000',
 '247',
 '24

## naive bayes for BOW + UNI 

In [29]:
mnb = MultinomialNB(alpha=1.0)
mnb.fit(X_train_counts,train_y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [30]:
test['cleantxt'] = test['text'].apply(CleanText)
test['cleantxt'] = test['cleantxt'].apply(removeStopWords)
test['cleantxt'] = test['cleantxt'].apply(stemming)
test_x = count_vect.transform(test['cleantxt'])
test_x_array = test_x.toarray()
test_y = le.transform(test['sentiment'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [31]:
predictions = mnb.predict(test_x_array)

In [32]:
count = 0 
for i in range (len(predictions)):
    if predictions[i]==test_y[i]:
        count=count+1
count

909

In [33]:
count/1073

0.8471575023299162

In [34]:
confusion_matrix(test_y, predictions,labels=[0,1])

array([[812,  41],
       [123,  97]])

In [35]:
#Write into csv
with open('uniwrong.csv', 'w') as f:
    writer = csv.writer(f)
    count = 0
    writer.writerow(["text","prediction", "sentiment"])
    for i in range (len(predictions)):
        if predictions[i]!=test_y[i]:
            if predictions[i] == 0 and test_y[i] == 1:
                writer.writerow([str(test.iloc[i].text), 'negtive','positive'])
            elif  predictions[i] == 1 and test_y[i] == 0: 
                writer.writerow([str(test.iloc[i].text), 'positive','negtive'])
            count += 1
count

164

In [36]:
#Write into csv
with open('uniright.csv', 'w') as f:
    writer = csv.writer(f)
    count = 0
    writer.writerow(["text","prediction", "sentiment"])
    for i in range (len(predictions)):
        if predictions[i]==test_y[i]:
            if predictions[i] == 0 and test_y[i] == 0:
                writer.writerow([str(test.iloc[i].text), 'negtive','negative'])
            elif  predictions[i] == 1 and test_y[i] == 1: 
                writer.writerow([str(test.iloc[i].text), 'positive','positive'])
            count += 1
count

909

# Tokenizing using bag of words - UNI + BI

In [21]:
count_vect = CountVectorizer(ngram_range = (1,2))

In [22]:
X_train_counts = count_vect.fit_transform(train_x)

In [23]:
X_train_counts.shape

(9656, 61807)

In [24]:
X_train_counts.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [25]:
np.count_nonzero(X_train_counts.toarray()) ## Just checking for non zeros. 

207490

In [26]:
count_vect.get_feature_names()

['07',
 '07 fun',
 '0iiiiiii0_girl',
 '0iiiiiii0_girl take',
 '10',
 '10 2001',
 '10 angri',
 '10 appt',
 '10 best',
 '10 billion',
 '10 candid',
 '10 carlyfiorina',
 '10 contend',
 '10 couldnt',
 '10 cringeworthi',
 '10 donat',
 '10 dread',
 '10 gotten',
 '10 gov',
 '10 guy',
 '10 horsemen',
 '10 httptconp0vngnotx',
 '10 httptcovxszukmivb',
 '10 httâ',
 '10 insan',
 '10 man',
 '10 mani',
 '10 men',
 '10 mil',
 '10 million',
 '10 millionair',
 '10 min',
 '10 moment',
 '10 mudsling',
 '10 peopl',
 '10 sec',
 '10 straight',
 '10 wealthi',
 '10 white',
 '10 women',
 '10 year',
 '10 yrs',
 '100',
 '100 almost',
 '100 even',
 '100 hilari',
 '100 min',
 '100 minut',
 '100 rate',
 '100 sergey',
 '100 support',
 '100 sure',
 '100 vote',
 '10000',
 '10000 follow',
 '100000',
 '100000 1996',
 '100000 protest',
 '10000x',
 '10000x better',
 '1000s',
 '1000s job',
 '1000s union',
 '100kyr',
 '100kyr consid',
 '100s',
 '100s 1000s',
 '1015',
 '1015 carson',
 '106',
 '106 billion',
 '10k',
 '10k new

## naive bayes for BOW

In [27]:
mnb = MultinomialNB(alpha=1.0)
mnb.fit(X_train_counts,train_y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [28]:
test['cleantxt'] = test['text'].apply(CleanText)
test['cleantxt'] = test['cleantxt'].apply(removeStopWords)
test['cleantxt'] = test['cleantxt'].apply(stemming)
test_x = count_vect.transform(test['cleantxt'])
test_x_array = test_x.toarray()
test_y = le.transform(test['sentiment'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [29]:
predictions = mnb.predict(test_x_array)

In [30]:
count = 0 
for i in range (len(predictions)):
    if predictions[i]==test_y[i]:
        count=count+1
count

917

In [31]:
count/1073

0.8546132339235788

In [32]:
confusion_matrix(test_y, predictions,labels=[0,1])

array([[824,  29],
       [127,  93]])

In [33]:
#Write into csv
with open('unibiwrong.csv', 'w') as f:
    writer = csv.writer(f)
    count = 0
    writer.writerow(["text","prediction", "sentiment"])
    for i in range (len(predictions)):
        if predictions[i]!=test_y[i]:
            if predictions[i] == 0 and test_y[i] == 1:
                writer.writerow([str(test.iloc[i].text), 'negtive','positive'])
            elif  predictions[i] == 1 and test_y[i] == 0: 
                writer.writerow([str(test.iloc[i].text), 'positive','negtive'])
            count += 1
count

156

In [34]:
#Write into csv
with open('unibiright.csv', 'w') as f:
    writer = csv.writer(f)
    count = 0
    writer.writerow(["text","prediction", "sentiment"])
    for i in range (len(predictions)):
        if predictions[i]==test_y[i]:
            if predictions[i] == 0 and test_y[i] == 0:
                writer.writerow([str(test.iloc[i].text), 'negtive','negative'])
            elif  predictions[i] == 1 and test_y[i] == 1: 
                writer.writerow([str(test.iloc[i].text), 'positive','positive'])
            count += 1
count

917

# BOW - UNI +  Intensified bi 

## prepare features

In [37]:
intbidf = pd.read_csv("intbi.csv",encoding='ISO-8859-1') 

In [38]:
intbidf.shape

(104, 1)

In [39]:
dict = {}
for p in intbidf['intensified bi']:
    count = 0
    for text in train_x:
        if p in text:
            count += 1
    dict[p] = count

In [40]:
sorted_dict = sorted(dict.items(), key=operator.itemgetter(1), reverse = True)

In [41]:
top_sorted_dict = sorted_dict[:10]
top_sorted_dict

[('disappoint fox', 131),
 ('great job', 19),
 ('ass gopdeb', 18),
 ('best line', 18),
 ('absolut fear', 17),
 ('straight outta', 16),
 ('good gopdeb', 15),
 ('biggest loser', 14),
 ('great joke', 14),
 ('better answer', 13)]

In [42]:

biglist = []
for i in train_x:
    mylist =[]
    for a,b in top_sorted_dict:
        if a in i:
           mylist.append(1)
        else:
            mylist.append(0)
      
    biglist.append(mylist);
    


In [43]:
len(biglist)
#len(biglist[100])

9656

In [44]:
intbi_train_x=np.array([np.array(xi) for xi in biglist])
intbi_train_x

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

#### combine features

In [45]:
intbi_train_x.shape

(9656, 10)

In [46]:
X_train_counts.toarray().shape

(9656, 12533)

In [47]:
type(intbi_train_x)

numpy.ndarray

In [48]:
type(X_train_counts.toarray())

numpy.ndarray

In [49]:
c_train_x = np.concatenate((X_train_counts.toarray(), intbi_train_x), axis=1) 

In [50]:
c_train_x.shape

(9656, 12543)

## naive bayes for uni + Intensified bi 

In [51]:
mnb = MultinomialNB(alpha=1.0)
mnb.fit(c_train_x,train_y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

### Prepare test data features

In [52]:
test['cleantxt'] = test['text'].apply(CleanText)
test['cleantxt'] = test['cleantxt'].apply(removeStopWords)
test['cleantxt'] = test['cleantxt'].apply(stemming)
test_x = count_vect.transform(test['cleantxt'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [53]:
test_x_array = test_x.toarray()

In [54]:
test_x_array

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [55]:
biglist = []
for i in test['cleantxt']:
    mylist =[]
    for a,b in top_sorted_dict:
        if a in i:
            mylist.append(1)
        else:
            mylist.append(0)
      
    biglist.append(mylist);

In [56]:
len(biglist)
len(biglist[100])

10

In [57]:
intbi_test_x=np.array([np.array(xi) for xi in biglist])
intbi_test_x

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [58]:
test_y = le.transform(test['sentiment'])

### combine test data features

In [59]:
c_test_x = np.concatenate((test_x_array, intbi_test_x), axis=1) 

In [60]:
predictions = mnb.predict(c_test_x)

In [61]:
count = 0 
for i in range (len(predictions)):
    if predictions[i]==test_y[i]:
        count=count+1
count

911

In [62]:
count/1073

0.8490214352283317

In [63]:
confusion_matrix(test_y, predictions,labels=[0,1])

array([[811,  42],
       [120, 100]])

In [64]:
#Write into csv
with open('uniintbiwrong.csv', 'w') as f:
    writer = csv.writer(f)
    count = 0
    writer.writerow(["text","prediction", "sentiment"])
    for i in range (len(predictions)):
        if predictions[i]!=test_y[i]:
            if predictions[i] == 0 and test_y[i] == 1:
                writer.writerow([str(test.iloc[i].text), 'negtive','positive'])
            elif  predictions[i] == 1 and test_y[i] == 0: 
                writer.writerow([str(test.iloc[i].text), 'positive','negtive'])
            count += 1
count

162

In [66]:
#Write into csv
with open('uniintbiright.csv', 'w') as f:
    writer = csv.writer(f)
    count = 0
    writer.writerow(["text","prediction", "sentiment"])
    for i in range (len(predictions)):
        if predictions[i]==test_y[i]:
            if predictions[i] == 0 and test_y[i] == 0:
                writer.writerow([str(test.iloc[i].text), 'negtive','negative'])
            elif  predictions[i] == 1 and test_y[i] == 1: 
                writer.writerow([str(test.iloc[i].text), 'positive','positive'])
            count += 1
count

911

# BOW - UNI + Intensified uni 

## prepare features

In [29]:
intunidf = pd.read_csv("intuni.csv",encoding='ISO-8859-1') 

In [30]:
intunidf.shape

(35, 1)

In [31]:
dict = {}
for p in intunidf['intensified uni']:
    count = 0
    for text in train_x:
        if p in text:
            count += 1
    dict[p] = count

In [32]:
sorted_dict = sorted(dict.items(), key=operator.itemgetter(1), reverse = True)

In [33]:
top_sorted_dict = sorted_dict[:10]
top_sorted_dict

[('ass', 307),
 ('great', 195),
 ('good', 193),
 ('best', 136),
 ('fuck', 118),
 ('better', 110),
 ('bad', 100),
 ('far', 85),
 ('total', 49),
 ('import', 43)]

In [34]:
biglist = []
for i in train_x:
    mylist =[]
    for a,b in top_sorted_dict:
        if a in i:
            mylist.append(1)
        else:
            mylist.append(0)
      
    biglist.append(mylist);

In [35]:
len(biglist)
len(biglist[100])

10

In [36]:
intuni_train_x=np.array([np.array(xi) for xi in biglist])
intuni_train_x

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 1, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

### combine features

In [37]:
c_train_x = np.concatenate((X_train_counts.toarray(), intuni_train_x), axis=1) 

## naive bayes for UNI + Intensified uni 

In [38]:
mnb = MultinomialNB(alpha=1.0)
mnb.fit(c_train_x,train_y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [39]:
test['cleantxt'] = test['text'].apply(CleanText)
test['cleantxt'] = test['cleantxt'].apply(removeStopWords)
test['cleantxt'] = test['cleantxt'].apply(stemming)
test_x = count_vect.transform(test['cleantxt'])
test_x_array = test_x.toarray()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [40]:
biglist = []
for i in test['cleantxt']:
    mylist =[]
    for a,b in top_sorted_dict:
        if a in i:
            mylist.append(1)
        else:
            mylist.append(0)
      
    biglist.append(mylist);

In [41]:
len(biglist)
len(biglist[100])

10

In [42]:
intuni_test_x=np.array([np.array(xi) for xi in biglist])
intuni_test_x

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0]])

In [43]:
c_test_x = np.concatenate((test_x_array, intuni_test_x), axis=1) 

In [44]:
test_y = le.transform(test['sentiment'])

In [45]:
predictions = mnb.predict(c_test_x)

In [46]:
count = 0 
for i in range (len(predictions)):
    if predictions[i]==test_y[i]:
        count=count+1
count

905

In [47]:
count/1073

0.8434296365330848

In [48]:
confusion_matrix(test_y, predictions,labels=[0,1])

array([[805,  48],
       [120, 100]])

# BOW - UNI + Unitensified Bi

## Prepare features

In [29]:
unbidf = pd.read_csv("unbi.csv",encoding='ISO-8859-1') 

In [30]:
unbidf.shape

(211, 1)

In [31]:
dict = {}
for p in unbidf['unitensified bi']:
    count = 0
    for text in train_x:
        if p in text:
            count += 1
    dict[p] = count

In [32]:
sorted_dict = sorted(dict.items(), key=operator.itemgetter(1), reverse = True)

In [33]:
top_sorted_dict = sorted_dict[:10]
top_sorted_dict

[('fair amp', 188),
 ('obvious tri', 135),
 ('legitim question', 63),
 ('realli like', 58),
 ('right gopdeb', 55),
 ('mean realdonaldtrump', 52),
 ('right mean', 51),
 ('right fight', 39),
 ('close statement', 29),
 ('right nobodi', 23)]

In [34]:
biglist = []
for i in train_x:
    mylist =[]
    for a,b in top_sorted_dict:
        if a in i:
            mylist.append(1)
        else:
            mylist.append(0)
      
    biglist.append(mylist);

In [35]:
len(biglist)
len(biglist[100])

10

In [36]:
unbi_train_x=np.array([np.array(xi) for xi in biglist])
unbi_train_x

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [37]:
c_train_x = np.concatenate((X_train_counts.toarray(), unbi_train_x), axis=1) 

## naive bayes for UNI + Unitensified Bi

In [38]:
mnb = MultinomialNB(alpha=1.0)
mnb.fit(c_train_x,train_y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [39]:
test['cleantxt'] = test['text'].apply(CleanText)
test['cleantxt'] = test['cleantxt'].apply(removeStopWords)
test_x = count_vect.transform(test['cleantxt'])
test_x_array = test_x.toarray()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [40]:
biglist = []
for i in test['cleantxt']:
    mylist =[]
    for a,b in top_sorted_dict:
        if a in i:
            mylist.append(1)
        else:
            mylist.append(0)
      
    biglist.append(mylist);

In [41]:
len(biglist)
len(biglist[100])

10

In [42]:
unbi_test_x=np.array([np.array(xi) for xi in biglist])
unbi_test_x

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [43]:
c_test_x = np.concatenate((test_x_array, unbi_test_x), axis=1) 

In [44]:
test_y = le.transform(test['sentiment'])

In [45]:
predictions = mnb.predict(c_test_x)

In [46]:
count = 0 
for i in range (len(predictions)):
    if predictions[i]==test_y[i]:
        count=count+1
count

892

In [47]:
count/1073

0.8313140726933831

In [48]:
confusion_matrix(test_y, predictions,labels=[0,1])

array([[803,  50],
       [131,  89]])

# BOW - UNI + Unitensified Uni

## Prepare features

In [29]:
ununidf = pd.read_csv("ununi.csv",encoding='ISO-8859-1') 

In [30]:
ununidf.shape

(63, 1)

In [31]:
dict = {}
for p in ununidf['unintensified uni']:
    count = 0
    for text in train_x:
        if p in text:
            count += 1
    dict[p] = count

In [32]:
sorted_dict = sorted(dict.items(), key=operator.itemgetter(1), reverse = True)

In [33]:
top_sorted_dict = sorted_dict[:10]
top_sorted_dict

[('real', 1373),
 ('ok', 650),
 ('right', 388),
 ('fair', 259),
 ('realli', 259),
 ('big', 186),
 ('fun', 162),
 ('obvious', 162),
 ('mean', 137),
 ('liber', 100)]

In [34]:
biglist = []
for i in train_x:
    mylist =[]
    for a,b in top_sorted_dict:
        if a in i:
            mylist.append(1)
        else:
            mylist.append(0)
      
    biglist.append(mylist);

In [35]:
ununi_train_x=np.array([np.array(xi) for xi in biglist])
ununi_train_x

array([[1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 1, ..., 0, 1, 0]])

In [36]:
len(biglist)
#len(biglist[100])

9656

In [37]:
c_train_x = np.concatenate((X_train_counts.toarray(), ununi_train_x), axis=1) 

## naive bayes for UNI + Unitensified Uni

In [38]:
mnb = MultinomialNB(alpha=1.0)
mnb.fit(c_train_x,train_y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [39]:
test['cleantxt'] = test['text'].apply(CleanText)
test['cleantxt'] = test['cleantxt'].apply(removeStopWords)
test_x = count_vect.transform(test['cleantxt'])
test_x_array = test_x.toarray()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [40]:
biglist = []
for i in test['cleantxt']:
    mylist =[]
    for a,b in top_sorted_dict:
        if a in i:
            mylist.append(1)
        else:
            mylist.append(0)
      
    biglist.append(mylist);

In [41]:
len(biglist)
len(biglist[100])

10

In [42]:
ununi_test_x=np.array([np.array(xi) for xi in biglist])
ununi_test_x

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [43]:
c_test_x = np.concatenate((test_x_array, ununi_test_x), axis=1) 

In [44]:
test_y = le.transform(test['sentiment'])

In [45]:
predictions = mnb.predict(c_test_x)

In [46]:
count = 0 
for i in range (len(predictions)):
    if predictions[i]==test_y[i]:
        count=count+1
count

889

In [47]:
count/1073

0.8285181733457595

In [48]:
confusion_matrix(test_y, predictions,labels=[0,1])

array([[800,  53],
       [131,  89]])

# UNI + BI + INTENSIFIER + UNINTENSIFIED

# TF Idf's + unigram

In [21]:
tf = TfidfVectorizer(min_df=1,stop_words='english',max_features=5000)

In [22]:
train_x_tfidf = tf.fit_transform(train_x)

In [23]:
train_x_tfidf_array = train_x_tfidf.toarray()

In [24]:
train_x_tfidf_array[0]

array([0., 0., 0., ..., 0., 0., 0.])

In [25]:
tf.inverse_transform(train_x_tfidf_array[0])

[array(['adult', 'bring', 'gopdeb', 'megynkelli', 'picturesshould', 'pose',
        'rt', 'rwsurfergirl'], dtype='<U40')]

# TF Idf's + unigram + bigram

In [282]:
tf_bi = TfidfVectorizer(ngram_range=(1,2),min_df=1,stop_words='english',max_features=5000)

In [283]:
#data = np.array(['blah blah','drink game','exact like','lmfao'])
#s = pd.Series(data)


In [284]:
#featfidf = tf_bi.fit_transform(s)
#featfidf

In [285]:
train_x_tfidf_bi = tf_bi.fit_transform(train_x)
train_x_tfidf_array_bi = train_x_tfidf_bi.toarray()

In [286]:
train_x_tfidf_array_bi

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [279]:
train_x_tfidf_array_bi[0]
len(train_x_tfidf_array_bi[0])

5000

In [276]:
tf_bi.get_feature_names()

['10',
 '10 men',
 '10 million',
 '100',
 '11',
 '12',
 '14',
 '14th',
 '14th amend',
 '15',
 '16',
 '17',
 '19',
 '1st',
 '1st debat',
 '1st gopdeb',
 '20',
 '2012',
 '2013',
 '2015',
 '2015 gopdeb',
 '2016',
 '2016 gopdeb',
 '2016 presidenti',
 '2016elect',
 '21st',
 '21st centuri',
 '22aday',
 '22aday empti',
 '2nd',
 '2nd string',
 '30',
 '30 yrs',
 '32',
 '3rd',
 '3rd parti',
 '40',
 '44',
 '4th',
 '4th amend',
 '50',
 '50th',
 '50th anniversari',
 '5th',
 '70',
 '70 year',
 '700',
 '700 law',
 '90',
 '90 day',
 '911',
 '_hankrearden',
 'abandon',
 'abc',
 'abc cbs',
 'abil',
 'abl',
 'abort',
 'abort ban',
 'abort dont',
 'abort gopdeb',
 'abort question',
 'abort right',
 'abs_tellthetal',
 'abs_tellthetal success',
 'absolut',
 'absolut fear',
 'absurd',
 'abt',
 'accept',
 'access',
 'accident',
 'accident eat',
 'accomplish',
 'accord',
 'account',
 'accus',
 'act',
 'act like',
 'act love',
 'action',
 'actor',
 'actual',
 'actual gopdeb',
 'actual like',
 'actual said',
 'a

In [278]:
len(tf_bi.get_feature_names())
#type(tf_bi.get_feature_names())

list

In [268]:
#Write into csv
with open('bigramfeature.csv', 'w') as f:
    writer = csv.writer(f)
    count = 0
    writer.writerow(["features"])
    for feature in tf_bi.get_feature_names():
        writer.writerow([feature])
        count += 1
count

5000

In [269]:
tf_bi.inverse_transform(train_x_tfidf_array_bi[0])

[array(['adult', 'adult picturesshould', 'bring', 'bring gopdeb', 'gopdeb',
        'gopdeb gopdeb', 'megynkelli', 'megynkelli pose', 'picturesshould',
        'picturesshould bring', 'pose', 'pose adult', 'rt',
        'rt rwsurfergirl', 'rwsurfergirl', 'rwsurfergirl megynkelli'],
       dtype='<U32')]

# Importing Learning models

## naive bayes for bow

In [75]:
mnb = MultinomialNB(alpha=1.0)
mnb.fit(intbi_train_x,train_y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [28]:
test['cleantxt'] = test['text'].apply(CleanText)
test['cleantxt'] = test['cleantxt'].apply(removeStopWords)
test['cleantxt'] = test['cleantxt'].apply(stemming)
test_x = count_vect.transform(test['cleantxt'])
test_x_array = test_x.toarray()
test_y = le.transform(test['sentiment'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [29]:
predictions = mnb.predict(test_x_array)

In [30]:
count = 0 
for i in range (len(predictions)):
    if predictions[i]==test_y[i]:
        count=count+1
count

1820

In [31]:
count/2146

0.848089468779124

## Multinomial Naive Bayes

In [68]:
from sklearn.naive_bayes import MultinomialNB

In [69]:
mnb = MultinomialNB(alpha=1.0)

In [70]:
mnb.fit(train_x_tfidf_array,train_y)

NameError: name 'train_x_tfidf_array' is not defined

### Preparing the test data

In [49]:
test['cleantxt'] = test['text'].apply(CleanText)
test['cleantxt'] = test['cleantxt'].apply(removeStopWords)
test['cleantxt'] = test['cleantxt'].apply(stemming)
test_x_tfidf = tf.transform(test['cleantxt'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [50]:
test_x_tfidf_array = test_x_tfidf.toarray()

In [51]:
test_y = le.transform(test['sentiment']) 

In [52]:
test_y.shape

(2146,)

In [53]:
test_x_tfidf_array.shape

(2146, 5000)

In [54]:
predictions = mnb.predict(test_x_tfidf_array)

In [55]:
predictions.shape

(2146,)

In [58]:
count = 0 
for i in range (len(predictions)):
    if predictions[i]==test_y[i]:
        count=count+1
count

1787

In [59]:
count/2146

0.8327120223671948

## Multinomial Naive Bayes tf-idf + unigram + bigram

In [244]:
mnb = MultinomialNB(alpha=1.0)
mnb.fit(train_x_tfidf_array_bi,train_y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [245]:
#prepare test data
test['cleantxt'] = test['text'].apply(CleanText)
test['cleantxt'] = test['cleantxt'].apply(removeStopWords)
test['cleantxt'] = test['cleantxt'].apply(stemming)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [246]:
test_x_tfidf_bi = tf_bi.transform(test['cleantxt'])
test_x_tfidf_array = test_x_tfidf_bi.toarray()
test_y = le.transform(test['sentiment']) 

In [247]:
test_y.shape

(2146,)

In [248]:
test_x_tfidf_array.shape

(2146, 5000)

In [249]:
predictions = mnb.predict(test_x_tfidf_array)

In [250]:
predictions.shape

(2146,)

In [251]:
count = 0 
for i in range (len(predictions)):
    if predictions[i]==test_y[i]:
        count=count+1
count

1811

In [252]:
count/2146

0.8438956197576887

In [253]:
print(predictions)

[0 0 0 ... 0 0 0]


In [70]:
# an example to see the first 100
for i in range (100):
    if predictions[i]!=test_y[i]:
        print(test.iloc[i].text,predictions[i],'  ' , test_y[i])

#GOPDebate had more action than the Manny vs Mayweather fight 0    1
Love how Rubio said the election is not about resumes, but ideas for the future and a high tech future. #GOPDebates 0    1
My response when @megynkelly tried to call @realDonaldTrump a sexist: "CAN'T STUMP THE TRUMP!" #GOPDebate 0    1
RT @RWSurferGirl: Ask Trump a legitimate question. Look at Wallace's face when Trump nails it. ðºð¸ #GOPDebate  #GOPDebates 1    0
RT @4BillLewis: #Facebook &amp; #Twitter made #GOPDebate interesting #jebbush #realdonaldtrump #marcorubio #scottwalker #JohnKasich #realbencarâ¦ 0    1
RT @TheTexasPhoenix: He was on fire! .@GovernorPerry's "On Fire" Debate Performance: https://t.co/0A1w02w7SZ #GOPDebate #Perry2016 0    1
#MarcoRubio speaks from a true American position. He's not perfect, but trying hard, giving it his very best  #FOXNEWSDEBATE #GOPDebates 0    1
#FoxNews was the big winner at #GOPDebate. No way future viewership was not the objective. Surely already said. 0    1
RT @C

In [79]:
#Write into csv
with open('wrong.csv', 'w') as f:
    writer = csv.writer(f)
    count = 0
    writer.writerow(["text","prediction", "sentiment"])
    for i in range (len(predictions)):
        if predictions[i]!=test_y[i]:
            if predictions[i] == 0 and test_y[i] == 1:
                writer.writerow([str(test.iloc[i].text), 'negtive','positive'])
            elif  predictions[i] == 1 and test_y[i] == 0: 
                writer.writerow([str(test.iloc[i].text), 'positive','negtive'])
            count += 1
count

335

## Logistic Regression

In [43]:
from sklearn.linear_model import LogisticRegression # load the library

In [44]:
log_reg = LogisticRegression(C=4.0)

In [45]:
log_reg.fit(train_x_tfidf_array,train_y)



LogisticRegression(C=4.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [46]:
log_reg.score(train_x_tfidf_array,train_y) # running it on the train set itself. 

0.9157637189793778

In [47]:
log_reg.score(test_x_tfidf_array,test_y) # running it on the test set. 

0.858807082945014

## Logistic Regression + unigram + bigram

In [164]:
log_reg = LogisticRegression(C=4.0)

In [165]:
log_reg.fit(train_x_tfidf_array_bi,train_y)



LogisticRegression(C=4.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [166]:
log_reg.score(train_x_tfidf_array_bi,train_y)

0.9149481533263427

In [167]:
log_reg.score(test_x_tfidf_array,test_y)

0.8560111835973905

## SVM

In [79]:
from sklearn import svm

In [80]:
clf = svm.SVC(C=1.0,degree=1,kernel='linear')

In [50]:
clf.fit(train_x_tfidf_array,train_y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=1, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [51]:
predicted = clf.predict(test_x_tfidf_array)

In [52]:
count = 0 
for i in range (len(predicted)):
    if predicted[i]==test_y[i]:
        count=count+1
count

1842

In [54]:
count/2146

0.8583410997204101

## SVM + unigram + bigram

In [81]:
clf = svm.SVC(C=1.0,degree=1,kernel='linear')

In [82]:
clf.fit(train_x_tfidf_array_bi,train_y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=1, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [83]:
predicted = clf.predict(test_x_tfidf_array)

In [84]:
count = 0 
for i in range (len(predicted)):
    if predicted[i]==test_y[i]:
        count=count+1
count

1849

In [85]:
count/2146

0.8616029822926374