In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import string
from nltk.tokenize import TweetTokenizer

In [2]:
# Importing some data
train=pd.read_csv('./data/trainingandtestdata/training.1600000.processed.noemoticon.csv',encoding='iso-8859-1',header=None)

In [3]:
train.iloc[:,0].value_counts()

4    800000
0    800000
Name: 0, dtype: int64

In [4]:
train = train.loc[:,[0,5]]
train.columns = ['sentiment','text']

In [5]:
neg_train = train[train['sentiment'] == 0]
pos_train = train[train['sentiment'] == 4]
neg_train = neg_train.sample(n=20000)
pos_train = pos_train.sample(n=20000)
train = pd.concat([neg_train,pos_train])

In [6]:
train['text'] = train['text'].str.lower()

In [7]:
tknzr = TweetTokenizer()
train['tokens'] = train.apply(lambda row: tknzr.tokenize(row['text']),axis=1)

In [8]:
stop_words = set(stopwords.words('english')+list(string.punctuation)+['...','“','’','”','‘','–'])

In [9]:
def getFreqDist(df):
    freqdist = nltk.FreqDist()
    for i in df.index:
        tokens = df.loc[i,'tokens']
        for token in tokens:
            # skip numbers, @ string, word in stop list and words have quotation
            if token.isdigit() or token in stop_words or token.startswith('@'):
                continue
            freqdist[token] += 1
    return freqdist
# neg_freqdist = getFreqDist(neg_train)
# pos_freqdist = getFreqDist(pos_train)
all_words = getFreqDist(train)

In [10]:
word_features = [w for (w,c) in all_words.most_common(500)]

In [19]:
def document_features(document):
    document_words = set(document['tokens'])
    features = {}
    for word in word_features:
        features['%s)' % word] = (word in document_words)
#     print(features)
    return features
train_set = [(document_features(d),d['sentiment']) for i,d in train.iterrows()]
train_set

[({"i'm)": False,
   '..)': False,
   'get)': False,
   'good)': False,
   'day)': False,
   'like)': False,
   'go)': False,
   'work)': False,
   'today)': False,
   'going)': False,
   'love)': False,
   'time)': False,
   'back)': False,
   'got)': False,
   'know)': False,
   'one)': False,
   'u)': False,
   'im)': False,
   'lol)': False,
   'really)': False,
   'night)': False,
   'think)': False,
   'want)': False,
   'see)': False,
   'home)': False,
   "can't)": False,
   'well)': False,
   'thanks)': False,
   'still)': False,
   'new)': False,
   'last)': False,
   'oh)': False,
   'much)': False,
   'miss)': False,
   'tomorrow)': False,
   'great)': False,
   'morning)': False,
   'hope)': False,
   'need)': False,
   'right)': False,
   'haha)': False,
   'bad)': False,
   'sad)': False,
   'fun)': False,
   'sleep)': False,
   'wish)': False,
   'twitter)': False,
   "i'll)": False,
   'feel)': False,
   'would)': False},
  0),
 ({"i'm)": False,
   '..)': False,
   'ge

In [20]:
clf = nltk.NaiveBayesClassifier.train(train_set)

In [37]:
tweets=pd.read_csv('./data/Tweets.csv')
tweets.columns

Index(['id', 'text'], dtype='object')

In [38]:
tweets['tokens'] = tweets.apply(lambda row: tknzr.tokenize(row['text']),axis=1)
tweets_set=[document_features(d) for i,d in tweets.iterrows()]

In [39]:
comments=pd.read_csv('./data/Comments.csv')

In [40]:
comments['tokens'] = comments.apply(lambda row: tknzr.tokenize(row['text']),axis=1)
comments_set=[document_features(d) for i,d in comments.iterrows()]

In [41]:
senti_tweets = pd.DataFrame(tweets['id'],columns=['id','sentiment'])
for i in senti_tweets.index:
    senti_tweets.loc[i,'sentiment'] = 'pos' if clf.classify(tweets_set[i]) == 4 else 'neg'

In [46]:
senti_comments = pd.DataFrame(comments['id'],columns=['id','sentiment'])
for i in senti_comments.index:
    senti_comments.loc[i,'sentiment'] = 'pos' if clf.classify(comments_set[i]) == 4 else 'neg'

In [47]:
senti_comments

Unnamed: 0,id,sentiment
0,1064307262318145537,pos
1,1064307258346143744,pos
2,1064307258329350144,neg
3,1064307258161537025,neg
4,1064307255485575169,pos
5,1064307248820670464,pos
6,1064307247617069057,pos
7,1064307246769815552,pos
8,1064307246170087425,pos
9,1064307246098731008,pos


In [49]:
tweets = tweets.merge(senti_tweets)
comments = comments.merge(senti_comments)

In [50]:
comments

Unnamed: 0,id,reply_id,text,user_id,tokens,sentiment
0,1064307262318145537,1064216956679716864,@realDonaldTrump You mean Matt Shittaker. Loser.,1036316352804519937,"[@realDonaldTrump, You, mean, Matt, Shittaker,...",pos
1,1064307258346143744,1064216956679716864,@realDonaldTrump You are the poster boy for di...,767938862534553600,"[@realDonaldTrump, You, are, the, poster, boy,...",pos
2,1064307258329350144,1064216956679716864,"@realDonaldTrump You do know don't you, that A...",802368694261346304,"[@realDonaldTrump, You, do, know, don't, you, ...",neg
3,1064307258161537025,1064216956679716864,@realDonaldTrump It's so sad to see the POTUS ...,217236028,"[@realDonaldTrump, It's, so, sad, to, see, the...",neg
4,1064307255485575169,1064209246114459648,@realDonaldTrump @FoxNews Will not watch liar,942922512035442693,"[@realDonaldTrump, @FoxNews, Will, not, watch,...",pos
5,1064307248820670464,1064216956679716864,@realDonaldTrump Hey Idiot : The Attorney Gene...,806362158338637824,"[@realDonaldTrump, Hey, Idiot, :, The, Attorne...",pos
6,1064307247617069057,1064245710747590657,@realDonaldTrump Catch and release is not an o...,3064052349,"[@realDonaldTrump, Catch, and, release, is, no...",pos
7,1064307246769815552,1064216956679716864,@realDonaldTrump Thank You President Trump for...,984828184851091456,"[@realDonaldTrump, Thank, You, President, Trum...",pos
8,1064307246170087425,1064245710747590657,@realDonaldTrump There are people that have o...,937724825648410625,"[@realDonaldTrump, There, are, people, that, h...",pos
9,1064307246098731008,1064246787161145346,@realDonaldTrump Love Rick Scott and DJT! Both...,506102465,"[@realDonaldTrump, Love, Rick, Scott, and, DJT...",pos


In [55]:
pos_count_by_reply = comments[comments['sentiment'] == 'pos']['sentiment'].groupby(comments['reply_id']).count()

In [56]:
pos_count_by_reply

reply_id
946531657229701120        1
947235015343202304        2
949287555660500992        1
950937350003183618        1
969529668234829825        1
969532384285687808        1
971726862526361603        1
980762392303980544        1
981859214380462081        1
987278269765517312        1
990227703575056384        7
999246677549768704        1
1003637916919320577       3
1004693718945984512       1
1007278788009480192      10
1007278825661784064       2
1009241032100335616       1
1014151457954811904       1
1019891915402203139       1
1021234525626609666       1
1023538164298858497       1
1023549416446279680       1
1025830647649247232       1
1025833273191264256       1
1026069857589227520       2
1026762818773757955       1
1026781688167366656       1
1027266105797238791       1
1028269087133257728       2
1029354577559281665       1
                       ... 
1065576088779272192     309
1065581119242940416     279
1065583286188158976     137
1065601413189984256     158
10657181096