In [1]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import re

In [10]:
def stopword_stemming(df):
    """Normalise the tweet text held in column ``'B'`` of ``df``.

    Each value of ``'B'`` is stripped, lower-cased and split on whitespace.
    Tokens that are English stop words or that start with ``'@'`` are
    dropped, the remaining tokens are stemmed with the English Snowball
    stemmer, and the result is joined back into one space-separated string.

    The caller's DataFrame is left untouched (the previous implementation
    added a temporary ``'split'`` column to it as a side effect).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named ``'B'``.

    Returns
    -------
    numpy.ndarray
        ``to_numpy()`` of a frame holding every column of ``df`` except the
        original ``'B'``, followed by the cleaned ``'B'`` column.
    """
    stops = set(stopwords.words("english"))
    stemmer = SnowballStemmer("english")

    def clean(tokens):
        # The stop-word test runs on the raw token, before stemming.
        return " ".join(
            stemmer.stem(token)
            for token in tokens
            if token not in stops and not token.startswith('@')
        )

    # Split the sentences to lists of words without writing into `df`.
    tokenised = df['B'].str.strip().str.lower().str.split()

    # drop() returns a new frame, so adding the cleaned column below cannot
    # leak back into the caller's object.
    result = df.drop(columns=['B'])
    result['B'] = tokenised.apply(clean)
    print(result.columns)
    return result.to_numpy()
        
    

In [3]:
def inputTraining(case):
    """Load the training tweets as a NumPy array.

    Columns 0 and 5 of the header-less training CSV are read and named
    ``'A'`` and ``'B'`` respectively.

    Parameters
    ----------
    case : int
        ``1`` returns the data as read from the file;
        ``2`` returns it after passing through ``stopword_stemming``.

    Returns
    -------
    numpy.ndarray
        The ``'A'`` and ``'B'`` columns, one row per CSV row.

    Raises
    ------
    ValueError
        If ``case`` is neither 1 nor 2. Previously such a value fell through
        and returned ``None`` after the whole file had already been parsed.
    """
    # Validate before touching the (large) file so a bad argument fails fast.
    if case not in (1, 2):
        raise ValueError("case must be 1 (raw) or 2 (stop-word removal + stemming), got %r" % (case,))

    df = pd.read_csv("./trainingandtestdata/training.1600000.processed.noemoticon.csv",encoding = "latin-1", header=None, usecols=[0,5],names=['A','B'],index_col=False)
    if case == 1:
        return df.to_numpy()
    return stopword_stemming(df)
    

def inputTesting():
    """Load the test tweets, leaving out every row whose label equals 2.

    Columns 0 and 5 of the header-less test CSV are read and named ``'A'``
    and ``'B'``; the frame is converted to a NumPy array and only the rows
    whose first column differs from 2 are returned.
    """
    frame = pd.read_csv(
        "./trainingandtestdata/testdata.manual.2009.06.14.csv",
        encoding="latin-1",
        header=None,
        usecols=[0, 5],
        names=['A', 'B'],
        index_col=False,
    )
    rows = frame.to_numpy()
    keep = rows[:, 0] != 2  # boolean row mask
    return rows[keep]

In [4]:
# A = Polarity
# B = TweetID
# C = date of the tweet 
# D = query (lyx)
# E = user
# F = tweet

#-----------CLASS------------

#class 0 negative
#class 2 neutral (rows with this label are dropped in inputTesting)
#class 4 positive

In [5]:
# creating vocabulary of all words in training data

def vocabulary(data):
    """Count word occurrences per class and over both classes combined.

    Rows of ``data`` are selected by their first column (label 0, then
    label 4); rows with any other label are ignored. The text in the second
    column has ``','`` and ``'.'`` replaced by spaces and is then split on
    whitespace.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array whose column 0 is the label and column 1 the tweet text.

    Returns
    -------
    vocab_class0 : dict
        word -> number of occurrences in label-0 tweets.
    vocab_class4 : dict
        word -> number of occurrences in label-4 tweets.
    TotalVocab : dict
        word -> number of occurrences in label-0 and label-4 tweets together.
    wc : list of int
        ``[tokens in label-0 tweets, tokens in label-4 tweets]``.
    """
    vocab_class0 = dict()
    vocab_class4 = dict()
    TotalVocab = dict()
    wc = []

    for label, class_vocab in ((0, vocab_class0), (4, vocab_class4)):
        tweets = data[data[:, 0] == label][:, 1]
        tokens_in_class = 0
        for tweet in tweets:
            for word in tweet.replace(',', " ").replace('.', " ").split():
                tokens_in_class += 1
                class_vocab[word] = class_vocab.get(word, 0) + 1
                # Accumulate independently of the per-class dict: keying the
                # update on "is the word new to this class" used to reset the
                # combined count to 1 for words already seen in class 0.
                TotalVocab[word] = TotalVocab.get(word, 0) + 1
        wc.append(tokens_in_class)

    return vocab_class0, vocab_class4, TotalVocab, wc
    

In [6]:
# finding parameters

def FindPhi(data):
    """Estimate the smoothed class priors for labels 0 and 4.

    Each prior is ``(rows with that label + 1) / (total rows + 2)``.

    Returns a ``(2, 1)`` float array: row 0 is the prior of label 0 and
    row 1 the prior of label 4.
    """
    labels = data[:, 0]
    denominator = data.shape[0] + 2
    phi = np.zeros((2, 1))
    for row, label in enumerate((0, 4)):
        phi[row, 0] = (np.count_nonzero(labels == label) + 1) / denominator
    return phi

def Findtheta(vocab_class0,  vocab_class4, TotalVocab,wc):
    """Compute add-one-smoothed word likelihoods for both classes.

    For every word in ``TotalVocab`` the result holds a two-element list
    ``[p(word | label 0), p(word | label 4)]`` where each entry is
    ``(count of the word in that class + 1) / (wc[class] + len(TotalVocab))``
    and a word absent from a class contributes a count of zero.
    """
    theta = dict()
    for word in TotalVocab:
        likelihoods = []
        # Same formula for both classes; only the count table and the class
        # token total differ.
        for class_vocab, class_tokens in ((vocab_class0, wc[0]), (vocab_class4, wc[1])):
            numerator = class_vocab[word] + 1 if word in class_vocab else 1
            likelihoods.append(numerator / (class_tokens + len(TotalVocab)))
        theta[word] = likelihoods
    return theta

In [7]:
def testing(TestingData,theta,phi):
    prediction = []
    for tweet in TestingData[:,1]:
        class0 = phi[0,0]
        class4 = phi[1,0]
        tweetlist = tweet.replace(','," ").replace('.'," ").split()
#         tweetlist = tweet.split()
        for words in tweetlist:
            if words in theta:
                class0 += math.log(theta[words][0]) 
            else:
                class0 += math.log(1/(len(tweetlist) + len(theta)))
            
            if words in theta:
                class4 += math.log(theta[words][1]) 
            else :
                class4 += math.log(1/(len(tweetlist) + len(theta)))
        
        class0 += math.log(phi[0,0])
        class4 += math.log(phi[1,0])
        if class0 > class4:
            prediction.append(0)
        else:
            prediction.append(4)
    count =0
    correct_class0 = 0
    correct_class4 = 0
    incorrect_class0 = 0
    incorrect_class4 = 0
    for i in  range(TestingData.shape[0]):
        if prediction[i] == TestingData[i,0]:
            if prediction[i] == 0:
                correct_class0+=1
            else:
                correct_class4+=1
            count+=1
        else:
            if prediction[i]==0:
                incorrect_class4+=1
            else:
                incorrect_class0+=1
    
    confusionMatrix = np.array([[correct_class0,incorrect_class4],[incorrect_class0,correct_class4]])
    Accurarcy = (count/TestingData.shape[0])*100
        
    return Accurarcy,confusionMatrix

In [8]:
def main():
    """Train the classifier, evaluate it and print a timed report.

    Steps: load the training data, build the per-class vocabularies, estimate
    the class priors (``phi``) and word likelihoods (``theta``), then print
    the accuracy on the test and training data, the elapsed wall-clock time
    and the confusion matrix obtained on the test data.
    """
    # 1 = raw tweets, 2 = stop-word removal + stemming (see inputTraining).
    # The old local `case = 1` was never used while 2 was hard-coded in the
    # call; one variable now drives the call, keeping the behaviour (2).
    case = 2
    startTime = time.time()
    TrainingData = inputTraining(case)
    (vocab_class0,vocab_class4, TotalVocab,wc) = vocabulary(TrainingData)
    phi = FindPhi(TrainingData)
    theta  = Findtheta(vocab_class0,  vocab_class4, TotalVocab,wc)
    # NOTE(review): inputTesting() returns the tweets as read from the file,
    # while the training data above went through stopword_stemming when
    # case == 2 - confirm whether the test tweets should be preprocessed the
    # same way before scoring.
    TestingData = inputTesting()
    (AccurarcyTest,confusionMatrixTest) = testing(TestingData,theta,phi)
    print("------------------------------PART 1-A-------------------")
    print("Accuracy over Testing data is : ",AccurarcyTest,"%")
    (AccurarcyTrain ,confusionMatrixTrain)= testing(TrainingData,theta,phi)
    print("Accuracy over Training data is : ",AccurarcyTrain,"%")
    print("---------------------------------------------------------\n")
    # Elapsed time covers loading, training and both evaluation passes.
    endTime = time.time()-startTime
    print("-----------------------TIME TAKEN------------------------")
    print("Total time taken is :",endTime)
    print("---------------------------------------------------------\n")
    print("------------------------------PART 1-C-------------------")
    print("CONFUSION MATRIX")
    print(confusionMatrixTest)
    print("---------------------------------------------------------\n")
    

In [11]:
# Run the full pipeline defined in main().
main()
# Leftover scratch (kept commented out): timing inputTraining(2) on its own.
# s= time.time()
# a=inputTraining(2)
# print(time.time() -s)
# a
# for index,value in a.itterrow

Index(['A', 'B'], dtype='object')
------------------------------PART 1-A-------------------
Accuracy over Testing data is :  78.55153203342618 %
Accuracy over Training data is :  80.99275 %
---------------------------------------------------------

-----------------------TIME TAKEN------------------------
Total time taken is : 197.38640069961548
---------------------------------------------------------

------------------------------PART 1-C-------------------
CONFUSION MATRIX
[[126  26]
 [ 51 156]]
---------------------------------------------------------

