# In [30]:  (Jupyter notebook cell prompt, kept as a comment so the file parses as Python)
import os

import re

from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import MultinomialNB

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.feature_extraction.text import TfidfTransformer

import numpy as np

  

# Directory the script is launched from; both data files are expected to live here.
relativePath = os.getcwd()

# Build the paths with os.path.join rather than a literal "\": the original
# "\Reviews.txt" / "\ProcessedReviews.csv" relied on the invalid escape
# sequences "\R" and "\P" and only produced a usable path on Windows.
rawFilePath = os.path.join(relativePath, "Reviews.txt")  # file containing raw data

processedFilePath = os.path.join(relativePath, "ProcessedReviews.csv")  # file containing processed data

  

def preProcessFile(rawPath=None, processedPath=None): # function to preprocess the data
    """Normalise the raw review file into a two-column ``label,text`` file.

    Every input line is lower-cased, each tab is replaced by a single space,
    and the characters ``, ! . ? # @ =`` plus the newline are removed.  The
    first space-separated token is written as the class label and the rest
    of the line as the review text, separated by a comma.

    Args:
        rawPath: Path of the raw input file.  When omitted, the module-level
            ``rawFilePath`` is used.
        processedPath: Path of the file to write (truncated if it exists).
            When omitted, the module-level ``processedFilePath`` is used.
    """
    if rawPath is None:
        rawPath = rawFilePath
    if processedPath is None:
        processedPath = processedFilePath

    # Compiled once instead of per line.  Commas are stripped here so the
    # review text can never contain the delimiter written below.
    badChar = re.compile("[,!.?#@=\n]")

    # "with" guarantees both handles are closed even if a line fails midway.
    with open(rawPath) as file, open(processedPath, "w") as writeFile:
        for line in file:
            # Lower-case, turn tabs into single spaces, then drop bad characters.
            line = badChar.sub("", line.lower().replace("\t", " "))
            # Everything before the first space is the class label; the
            # remainder (possibly empty) is the sentence itself.
            label, _, words = line.partition(" ")
            writeFile.write(label + "," + words + "\n")


def getDataAndLabel(path=None):
    """Load the processed ``label,text`` file into two parallel lists.

    Args:
        path: Path of the processed file.  When omitted, the module-level
            ``processedFilePath`` is used.

    Returns:
        A ``(data, label)`` tuple of two equal-length lists of strings:
        ``data[i]`` is the text of record ``i`` and ``label[i]`` its class.
    """
    if path is None:
        path = processedFilePath

    label = []
    data = []

    # "with" closes the handle; the original left it open.
    with open(path) as file:
        for line in file:
            line = line.rstrip("\n")
            if not line:
                # A completely empty line has no label/text pair to read.
                continue
            # Split on the FIRST comma only, so text that itself contains a
            # comma is kept whole and a line without any comma yields "".
            tag, _, text = line.partition(",")
            label.append(tag)  # first field is the class label
            data.append(text)  # everything after it is the text
    return data, label


def calBaseLine(data): # calculate baseline : it is percentage of records belonging to majority class
    """Print the majority class of *data* and the share of records it covers.

    The baseline is the accuracy a classifier would reach by always
    predicting the most frequent class.

    Args:
        data: Sequence of class labels (the target values).

    Returns:
        None.  Two lines are printed: the majority class and its percentage.
        For empty input an empty class name and ``0.0`` are printed.
    """
    total = len(data)
    if total == 0:  # len() rather than truthiness so numpy arrays work too
        # Nothing to count; avoid dividing by zero below.
        print("Base Class :", "")
        print("base Line :", 0.0)
        return

    # One pass: the distinct classes (sorted) together with their frequencies.
    classValues, counts = np.unique(data, return_counts=True)

    # argmax returns the first maximum, so a tie goes to the class that
    # sorts first.
    best = int(np.argmax(counts))
    baseClass = classValues[best]
    highest = int(counts[best])

    print("Base Class :", baseClass)
    print("base Line :", (float(highest) / total) * 100)

# --- Script: preprocess the reviews, train a Naive Bayes model, report accuracy ---

preProcessFile() # write the cleaned "label,text" file from the raw reviews

data,label=getDataAndLabel() # load the texts and their class labels as two parallel lists

# Hold out 25% of the records for testing (test_size=0.25), train on the other 75%.
# random_state is fixed so the same split is produced on every run.
dataTrain, dataTest, labelTrain, labelTest = train_test_split( data, label, test_size=0.25, random_state=45)

print ("Train size :",len(dataTrain),"\nTest Size :",len(dataTest))

count_vect = CountVectorizer() # bag-of-words vectorizer

X_train_counts = count_vect.fit_transform(dataTrain) # learn the vocabulary from the training texts and count the words of each one

tfidf_transformer = TfidfTransformer() # re-weights raw counts to TF-IDF

X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts) # fit on the training counts and convert them to Term Frequency times Inverse Document Frequency

model=MultinomialNB(fit_prior=True) # multinomial Naive Bayes; fit_prior=True is passed explicitly

model.fit(X_train_tfidf,labelTrain)# train the model on the TF-IDF features

# The test texts go through the SAME fitted vectorizer and transformer
# (transform only, no re-fitting), so the features line up with training.
X_new_counts = count_vect.transform(dataTest)

X_new_tfidf = tfidf_transformer.transform(X_new_counts)# TF-IDF features for the test data

 
predLabel = model.predict(X_new_tfidf)# predicted class for every test record

# Debug helpers, left disabled:
#print(predLabel)
#print(labelTest)

print ("Accuracy :",np.mean(predLabel==labelTest)*100) # percentage of test records whose predicted label equals the true label

# Print the majority-class baseline for the test labels, for comparison with the accuracy above.
calBaseLine(labelTest)

# Output of the run above:
# Train size : 3240
# Test Size : 1081
# Accuracy : 71.69287696577243
# Base Class : positive
# base Line : 68.91766882516188
