In [None]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import re

# Tokenization, Stopping and Stemming

In [None]:
def stopword_stemming(df):
    """Return a copy of ``df`` whose tweet column ``'B'`` is cleaned and stemmed.

    Each tweet is stripped, lower-cased, has ``,`` and ``.`` turned into
    spaces and ``!`` removed, and is split on whitespace. English stop words
    and tokens starting with ``@`` are dropped; the remaining tokens are
    Snowball-stemmed and re-joined with single spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column ``'B'``. It is NOT modified; the original
        implementation leaked a temporary ``'split'`` column into it.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns, where ``'B'`` holds the cleaned text.
    """
    stops = set(stopwords.words("english"))
    stemmer = SnowballStemmer("english")

    # regex=False: "." must be a literal full stop, not the regex wildcard.
    # The default of `regex` differs between pandas versions, so be explicit.
    tokens = (
        df['B']
        .str.strip()
        .str.lower()
        .str.replace(",", " ", regex=False)
        .str.replace(".", " ", regex=False)
        .str.replace("!", "", regex=False)
        .str.split()
    )

    cleaned = tokens.apply(
        lambda words: " ".join(
            stemmer.stem(word)
            for word in words
            if word not in stops and not word.startswith('@')
        )
    )

    # assign() builds a new frame and replaces 'B' in place (column order kept).
    return df.assign(B=cleaned)
        
    

# Input: training and testing data

In [None]:
def inputTraining(case, path="./trainingandtestdata/training.1600000.processed.noemoticon.csv"):
    """Load the training tweets (polarity and text columns only).

    Parameters
    ----------
    case : int
        1 -> return the raw data as a NumPy array of ``[polarity, tweet]`` rows.
        2 -> return a DataFrame (columns ``'A'``, ``'B'``) whose tweets have
        been passed through ``stopword_stemming``.
    path : str, optional
        Location of the training CSV; defaults to the path originally
        hard-coded here.

    Returns
    -------
    numpy.ndarray or pandas.DataFrame
        See ``case``.

    Raises
    ------
    ValueError
        If ``case`` is neither 1 nor 2 (previously this returned ``None``
        after loading the whole file).
    """
    # Validate first so a typo does not cost a full read of the CSV.
    if case not in (1, 2):
        raise ValueError("case must be 1 (raw) or 2 (stopped and stemmed), got %r" % (case,))

    df = pd.read_csv(
        path,
        encoding="latin-1",
        header=None,
        usecols=[0, 5],      # column 0 = polarity, column 5 = tweet text
        names=['A', 'B'],
        index_col=False,
    )
    if case == 1:
        return df.to_numpy()
    return stopword_stemming(df)
    

def inputTesting(path="./trainingandtestdata/testdata.manual.2009.06.14.csv"):
    """Load the test tweets and drop rows whose polarity is 2.

    Parameters
    ----------
    path : str, optional
        Location of the test CSV; defaults to the path originally hard-coded
        here.

    Returns
    -------
    numpy.ndarray
        Array of ``[polarity, tweet]`` rows with every polarity-2 row removed,
        since the classifier only models classes 0 and 4.
    """
    df = pd.read_csv(
        path,
        encoding="latin-1",
        header=None,
        usecols=[0, 5],      # column 0 = polarity, column 5 = tweet text
        names=['A', 'B'],
        index_col=False,
    )
    # Filter before converting so the discarded rows are never copied.
    return df[df['A'] != 2].to_numpy()

# Description of Data

In [None]:
# A = Polarity
# B = TweetID
# C = date of the tweet 
# D = query (lyx)
# E = user
# F = tweet

#-----------CLASS------------

#class 0 negative
#class 4 positive

# Creating Vocabulary

In [None]:
# creating vocabulary of all words in training data

def vocabulary(data):
    """Count word occurrences per class and over the whole training set.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array whose column 0 holds the class label (0 or 4) and column 1
        the tweet text.

    Returns
    -------
    vocab_class0 : dict
        word -> number of occurrences in class-0 tweets.
    vocab_class4 : dict
        word -> number of occurrences in class-4 tweets.
    TotalVocab : dict
        word -> number of occurrences across both classes.
    wc : list of int
        ``[total words in class 0, total words in class 4]``.
    """
    vocab_class0 = dict()
    vocab_class4 = dict()
    TotalVocab = dict()
    wc = []

    for label, class_vocab in ((0, vocab_class0), (4, vocab_class4)):
        class_tweets = data[np.where(data[:, 0] == label)][:, 1]
        words_in_class = 0
        for tweet in class_tweets:
            # Commas and full stops act as separators; then split on whitespace.
            for word in tweet.replace(",", " ").replace(".", " ").split():
                words_in_class += 1
                class_vocab[word] = class_vocab.get(word, 0) + 1
                # Accumulate independently of the per-class dict: the old code
                # reset this to 1 whenever a word was new to the current class
                # but had already been counted for the other class.
                TotalVocab[word] = TotalVocab.get(word, 0) + 1
        wc.append(words_in_class)

    print("Vocalbulary done")
    return vocab_class0, vocab_class4, TotalVocab, wc
    

# Finding the parameters

In [None]:
# finding parameters

def FindPhi(data):
    """Estimate the class priors with add-one smoothing.

    ``data`` is a 2-D array whose first column holds the class label.
    Returns a ``(2, 1)`` array: row 0 is the prior of class 0 and row 1 the
    prior of class 4, each computed as ``(count + 1) / (rows + 2)``.
    """
    labels = data[:, 0]
    smoothed_total = data.shape[0] + 2  # one pseudo-count per class
    phi = np.zeros((2, 1))
    for row, label in enumerate((0, 4)):
        phi[row, 0] = (np.count_nonzero(labels == label) + 1) / smoothed_total
    return phi

def Findtheta(vocab_class0,  vocab_class4, TotalVocab,wc):
    """Compute add-one-smoothed word likelihoods for both classes.

    For every word in ``TotalVocab`` the result maps
    ``word -> [P(word | class 0), P(word | class 4)]`` where each entry is
    ``(count_in_class + 1) / (words_in_class + len(TotalVocab))``.
    ``wc[0]`` / ``wc[1]`` are the total word counts of class 0 / class 4.
    """
    class_vocabs = (vocab_class0, vocab_class4)
    theta = {}
    for word in TotalVocab:
        # A word missing from a class contributes a count of 0, i.e. 1 / denominator.
        theta[word] = [
            (class_vocab.get(word, 0) + 1) / (wc[idx] + len(TotalVocab))
            for idx, class_vocab in enumerate(class_vocabs)
        ]
    print("parameters calculated")
    return theta

# Testing: Accuracy and Confusion Matrix

In [None]:
def testing(TestingData,theta,phi,case,test):
    prediction = []
    for tweet in TestingData[:,1]:
        class0 = phi[0,0]
        class4 = phi[1,0]
        
        #TESTING DATA
        if test == 0:
            if case ==1:
                
                tweetlist = tweet.replace(','," ").replace('.'," ").split()
            
            elif case ==2:
                stops = set(stopwords.words("english"))
                stemmer = SnowballStemmer("english")
                tweetlist =  [stemmer.stem(item) for item in tweet.strip().lower().replace("!","").replace(","," ").replace("."," ").split() if item not in stops and not item.startswith('@')]
       
        # TRAINING DATA
        if test == 1:
            tweetlist = tweet.split()
         
        #CALCULATE PROBABILTY
        for words in tweetlist:
            if words in theta:
                class0 += math.log(theta[words][0]) 
            else:
                class0 += math.log(1/(len(tweetlist) + len(theta)))
            
            if words in theta:
                class4 += math.log(theta[words][1]) 
            else :
                class4 += math.log(1/(len(tweetlist) + len(theta)))
        
        class0 += math.log(phi[0,0])
        class4 += math.log(phi[1,0])
        if class0 > class4:
            prediction.append(0)
        else:
            prediction.append(4)
    count =0
    correct_class0 = 0
    correct_class4 = 0
    incorrect_class0 = 0
    incorrect_class4 = 0
    for i in  range(TestingData.shape[0]):
        if prediction[i] == TestingData[i,0]:
            if prediction[i] == 0:
                correct_class0+=1
            else:
                correct_class4+=1
            count+=1
        else:
            if prediction[i]==0:
                incorrect_class4+=1
            else:
                incorrect_class0+=1
    
    confusionMatrix = np.array([[correct_class0,incorrect_class4],[incorrect_class0,correct_class4]])
    Accurarcy = (count/TestingData.shape[0])*100
        
    return Accurarcy,confusionMatrix

# Driver function that calls the steps in order: Parts A, B, C, D

In [None]:
def main(case=2):
    """Run the Naive Bayes pipeline end to end and print the results.

    Steps: load the training data, cache the cleaned frame to ``cleanData``,
    build the vocabulary, estimate ``phi`` and ``theta``, then report accuracy
    on the test and training sets, the elapsed time, and the test confusion
    matrix.

    Parameters
    ----------
    case : int, optional
        Pre-processing mode forwarded to ``inputTraining`` and ``testing``
        (1 = raw text, 2 = stop-word removal and stemming). Defaults to 2,
        the value that used to be hard-coded.
    """
    import pickle

    # In testing(): test=0 means the test set, test=1 means the training set.
    startTime = time.time()
    TrainingDataFrame = inputTraining(case)

    if isinstance(TrainingDataFrame, pd.DataFrame):
        TrainingData = TrainingDataFrame.to_numpy()
        # Cache the cleaned frame; the TF-IDF section reloads it from this file.
        with open("cleanData",'wb') as f:
            pickle.dump(TrainingDataFrame, f, protocol=pickle.HIGHEST_PROTOCOL)
    else:
        # inputTraining(1) already returns a NumPy array; calling .to_numpy()
        # on it would raise AttributeError.
        TrainingData = TrainingDataFrame

    (vocab_class0,vocab_class4, TotalVocab,wc) = vocabulary(TrainingData)
    phi = FindPhi(TrainingData)
    theta  = Findtheta(vocab_class0,  vocab_class4, TotalVocab,wc)
    TestingData = inputTesting()

    # Test-set accuracy
    (AccurarcyTest,confusionMatrixTest) = testing(TestingData,theta,phi,case,0)
    print("------------------------------PART 1-A-------------------")
    print("Accuracy over Testing data is : ",AccurarcyTest,"%")

    # Training-set accuracy
    (AccurarcyTrain ,confusionMatrixTrain)= testing(TrainingData,theta,phi,case,1)
    print("Accuracy over Training data is : ",AccurarcyTrain,"%")
    print("---------------------------------------------------------\n")

    endTime = time.time()-startTime
    print("-----------------------TIME TAKEN------------------------")
    print("Total time taken is :",endTime)
    print("---------------------------------------------------------\n")

    print("------------------------------PART 1-C-------------------")
    print("CONFUSION MATRIX FOR TEST DATA: ")
    print(" Actual \t\t Class 0 \t Class 1")
    print("predicted Class 0 \t",confusionMatrixTest[0,0],"\t\t",confusionMatrixTest[0,1])
    print("predicted Class 1 \t",confusionMatrixTest[1,0],"\t\t",confusionMatrixTest[1,1])
    print("\n")
    print("---------------------------------------------------------\n")
    

In [None]:
# Run the whole pipeline; as a side effect main() writes the cleaned
# training frame to the file "cleanData".
main()


# Part F: TF-IDF

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
from sklearn.naive_bayes import GaussianNB
import numpy as np
from tqdm import tqdm

In [2]:
# Reload the cleaned training frame that main() pickled to "cleanData".
# NOTE: pickle.load executes arbitrary code from the file, so only load a
# pickle produced by this notebook.
with open('cleanData','rb') as f:
    data=pickle.load(f)

# Fit a TF-IDF vocabulary on the cleaned tweets (column 'B') and transform
# them into a document-term matrix in a single pass.
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(data['B'])
# print(vectorizer.get_feature_names())
# Report the matrix dimensions as (rows, columns).
print(tfidf.shape)


(1600000, 322119)


In [3]:
# Sanity check on row 1599999: print the cleaned tweet text and the
# corresponding row of the TF-IDF matrix.
print(data['B'][1599999])
print(tfidf[1599999])

happi #charitytuesday
  (0, 140104)	0.39456621190909763
  (0, 81596)	0.9188675118969574


In [4]:
# Train Gaussian Naive Bayes incrementally. GaussianNB is fed dense arrays
# here, so the sparse TF-IDF matrix is densified one batch at a time.
BATCH_SIZE = 1000
ALL_CLASSES = np.array([0, 4])   # passed to partial_fit so every batch knows both labels

# Derive the row count from the matrix instead of hard-coding 1600000, so the
# loop stays correct if the training set is subsampled or replaced.
n_rows = tfidf.shape[0]
labels = data['A'].to_numpy()    # positional slicing, independent of the frame's index

clf_pf = GaussianNB()

for start in tqdm(range(0, n_rows, BATCH_SIZE)):
    stop = start + BATCH_SIZE
    clf_pf.partial_fit(tfidf[start:stop].toarray(), labels[start:stop], classes=ALL_CLASSES)


  3%|▎         | 50/1600 [03:55<2:06:05,  4.88s/it]

KeyboardInterrupt: 