In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import string
from nltk.tokenize import TweetTokenizer
import pickle
import os
from TwitterSentimentAnalyzer import TwitterSentimentAnalyzer

In [2]:
# Instantiate the project-local TwitterSentimentAnalyzer with no arguments.
# NOTE(review): `analyzer` is not referenced by any later cell visible here - confirm it is still needed.
analyzer = TwitterSentimentAnalyzer()

In [3]:
# Read the raw Sentiment140 training CSV. The file has no header row
# (header=None), so pandas labels the columns by position.
train = pd.read_csv(
    './data/trainingandtestdata/training.1600000.processed.noemoticon.csv',
    encoding='iso-8859-1',
    header=None,
)

In [4]:
def load_sentiment_model():
    """Load the pickled sentiment model, calling the trainer first if it is missing.

    Returns:
        The object unpickled from ``./model/SA_model.pickle``.

    Raises:
        FileNotFoundError: if the pickle file still does not exist after
            ``train_sentiment_model()`` has been called.
    """
    model_path = './model/SA_model.pickle'
    if not os.path.isfile(model_path):
        train_sentiment_model()
    # Pickle data is binary, so the file must be opened with 'rb'; in text
    # mode ('r') pickle.load fails on Python 3.
    # NOTE(review): pickle.load can execute arbitrary code - only load model
    # files that were produced locally.
    with open(model_path, 'rb') as f:
        return pickle.load(f)
    
def train_sentiment_model():
    """Load the training sample intended for fitting the sentiment model.

    NOTE(review): as written, this function only loads the sample returned by
    ``load_training_data``; no classifier is fitted and nothing is written to
    ``model_path``. TODO: implement training and persistence.
    """
    model_path = './model/SA_model.pickle'
    training_sample = load_training_data(
        './data/trainingandtestdata/training.1600000.processed.noemoticon.csv'
    )
    
def load_training_data(path, encoding='iso-8859-1', sample_size=40000, random_state=None):
    """Load a class-balanced, lower-cased sample of the sentiment140 CSV.

    Args:
        path: Path to the header-less CSV. Column 0 is read as the sentiment
            label (rows with 0 and 4 are kept) and column 5 as the tweet text.
        encoding: Text encoding passed to ``pandas.read_csv``.
        sample_size: Total number of rows wanted; ``sample_size // 2`` rows
            are drawn from each of the two label values.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            draw can be reproduced. ``None`` (default) keeps the sampling
            unseeded, as before.

    Returns:
        DataFrame with columns ``['sentiment', 'text']``: rows labelled 0
        first, then rows labelled 4, with the original row labels as index
        and ``text`` lower-cased.

    Raises:
        ValueError: if either label has fewer than ``sample_size // 2`` rows
            (sampling is without replacement).
    """
    # Only columns 0 and 5 are used, so skip parsing the others.
    train_df = pd.read_csv(path, encoding=encoding, header=None, usecols=[0, 5])
    train_df.columns = ['sentiment', 'text']

    per_class = sample_size // 2
    neg_train_df = train_df[train_df['sentiment'] == 0].sample(
        n=per_class, random_state=random_state)
    pos_train_df = train_df[train_df['sentiment'] == 4].sample(
        n=per_class, random_state=random_state)

    train_df = pd.concat([neg_train_df, pos_train_df])
    train_df['text'] = train_df['text'].str.lower()
    return train_df



In [5]:
# Class-balance check: count how many rows carry each value in the first column.
train.iloc[:,0].value_counts()

4    800000
0    800000
Name: 0, dtype: int64

In [6]:
# Keep only the label (column 0) and the tweet text (column 5), under
# readable names. rename() ignores labels that are absent, so re-running this
# cell after the columns were already renamed no longer raises a KeyError.
train = train.rename(columns={0: 'sentiment', 5: 'text'})[['sentiment', 'text']]

In [7]:
# Draw 20,000 rows per label (0 and 4). A fixed seed makes the sample - and
# everything derived from it - reproducible under Restart & Run All.
SAMPLE_SEED = 42
neg_train = train[train['sentiment'] == 0].sample(n=20000, random_state=SAMPLE_SEED)
pos_train = train[train['sentiment'] == 4].sample(n=20000, random_state=SAMPLE_SEED)
# `train` is rebound to the 40,000-row balanced sample from here on.
train = pd.concat([neg_train, pos_train])

In [8]:
# Lower-case the tweet text; this overwrites the 'text' column.
train['text'] = train['text'].str.lower()

In [9]:
tknzr = TweetTokenizer()
# Map the tokenizer over the text column directly. A row-wise
# DataFrame.apply(axis=1) builds a Series for every row, which is much slower
# and yields the same per-row token lists.
train['tokens'] = train['text'].map(tknzr.tokenize)

KeyboardInterrupt: 

In [None]:
# Tokens ignored when counting words: NLTK's English stop words, every
# character in string.punctuation, and a few extra symbols.
stop_words = set(stopwords.words('english')).union(
    string.punctuation,
    ['...','“','’','”','‘','–'],
)

In [None]:
def getFreqDist(df):
    """Count token occurrences over the 'tokens' column of ``df``.

    Tokens are skipped when they consist only of digits, are in the
    module-level ``stop_words`` set, or start with '@'.

    Args:
        df: DataFrame whose 'tokens' column holds an iterable of string
            tokens per row.

    Returns:
        nltk.FreqDist mapping each kept token to its count.

    Raises:
        KeyError: if ``df`` has no 'tokens' column.
    """
    freqdist = nltk.FreqDist()
    # Iterate the column values directly instead of df.loc[i, 'tokens'] per
    # index label: it avoids one label lookup per row and still works when
    # the index contains duplicate labels (where .loc returns a Series).
    for tokens in df['tokens']:
        for token in tokens:
            if token.isdigit() or token in stop_words or token.startswith('@'):
                continue
            freqdist[token] += 1
    return freqdist
# Per-class distributions are disabled; note that neg_train / pos_train were
# split off before the 'tokens' column was added to `train`.
# neg_freqdist = getFreqDist(neg_train)
# pos_freqdist = getFreqDist(pos_train)
# Token frequencies over the whole sampled training frame.
all_words = getFreqDist(train)

In [None]:
# The 500 most frequent tokens form the feature vocabulary.
word_features = [word for word, _count in all_words.most_common(500)]

In [None]:
def document_features(document):
    """Build the presence/absence feature dict for one document.

    Args:
        document: Mapping-like row whose 'tokens' entry is an iterable of tokens.

    Returns:
        dict with one entry per word in the module-level ``word_features``;
        the value is True when that word is among the document's tokens.
    """
    present = set(document['tokens'])
    # NOTE(review): the key format '%s)' looks like a truncated 'contains(%s)';
    # it is kept unchanged so feature names stay consistent wherever this
    # function is used.
    return {'%s)' % word: (word in present) for word in word_features}
# One (feature dict, label) pair per training row.
train_set = [(document_features(row), row['sentiment']) for _, row in train.iterrows()]
# Show only the size: echoing the list itself would print every feature dict
# for every training row.
len(train_set)

In [None]:
# Train NLTK's NaiveBayesClassifier on the (feature dict, label) pairs.
clf = nltk.NaiveBayesClassifier.train(train_set)

In [None]:
# Load the tweets to be classified and list the columns that are available.
tweets=pd.read_csv('./data/Tweets.csv')
tweets.columns

In [None]:
# Lower-case before tokenizing, as was done for the training text: the feature
# vocabulary was built from lower-cased tweets, so tokens such as "Good" would
# otherwise never match a feature. Mapping over the column also avoids the
# slow row-wise DataFrame.apply(axis=1).
tweets['tokens'] = tweets['text'].str.lower().map(tknzr.tokenize)
tweets_set = [document_features(row) for _, row in tweets.iterrows()]

In [None]:
# Load the comments to be classified.
comments=pd.read_csv('./data/Comments.csv')

In [None]:
# Lower-case before tokenizing, as was done for the training text: the feature
# vocabulary was built from lower-cased tweets, so tokens such as "Good" would
# otherwise never match a feature. Mapping over the column also avoids the
# slow row-wise DataFrame.apply(axis=1).
comments['tokens'] = comments['text'].str.lower().map(tknzr.tokenize)
comments_set = [document_features(row) for _, row in comments.iterrows()]

In [None]:
# Classify each tweet once and build the result frame in a single step.
# This replaces per-row .loc writes into an all-NaN column and no longer uses
# the index label as a position into tweets_set; labels are paired with ids
# purely by row order.
senti_tweets = pd.DataFrame({
    'id': tweets['id'],
    'sentiment': ['pos' if clf.classify(features) == 4 else 'neg'
                  for features in tweets_set],
})

In [None]:
# Classify each comment once and build the result frame in a single step.
# This replaces per-row .loc writes into an all-NaN column and no longer uses
# the index label as a position into comments_set; labels are paired with ids
# purely by row order.
senti_comments = pd.DataFrame({
    'id': comments['id'],
    'sentiment': ['pos' if clf.classify(features) == 4 else 'neg'
                  for features in comments_set],
})

In [None]:
# Preview the first rows instead of echoing the whole frame.
senti_comments.head()

In [None]:
# Attach the predicted sentiment to each frame. No `on=` is given, so pandas
# joins on whichever columns the two frames have in common.
tweets = pd.merge(tweets, senti_tweets)
comments = pd.merge(comments, senti_comments)

In [None]:
# Preview the first rows instead of echoing the whole frame.
comments.head()

In [None]:
# Number of rows labelled 'pos' for each reply_id value.
pos_count_by_reply = (
    comments.loc[comments['sentiment'] == 'pos']
    .groupby('reply_id')['sentiment']
    .count()
)

In [None]:
# Display the count of 'pos' rows per reply_id.
pos_count_by_reply