In [1]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [2]:
# Passing result_type=’expand’ will expand list-like results to columns of a Dataframe
dataset=pd.read_csv('Restaurant_Reviews.tsv',delimiter='\t')
#processe data
def processing(row):
    #只留下字母，清除部分用' '代替
    review= re.sub('[^a-zA-z]',' ',row['Review'] ).lower().split()
    #词根化
    ps=PorterStemmer()
    #去除虚词
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review=' '.join(review)
    return review

dataset['Review']=dataset.apply(processing,axis=1)
print(dataset.head())
print(len(dataset.index))

                                              Review  Liked
0                                     wow love place      1
1                                         crust good      0
2                                 tasti textur nasti      0
3  stop late may bank holiday rick steve recommen...      1
4                            select menu great price      1
1000


In [3]:
#split data into training and testing
np.random.seed(2019)
dataset['selectMarker']=np.random.uniform(0,1,len(dataset.index))
# uniformly distributed,0.75 train
trainSet=dataset[dataset['selectMarker']<0.75]
testSet=dataset[dataset['selectMarker']>=0.75]
print(trainSet.head())
print(len(trainSet.index))
print(len(testSet.index))

                                              Review  Liked  selectMarker
1                                         crust good      0      0.393081
2                                 tasti textur nasti      0      0.623970
3  stop late may bank holiday rick steve recommen...      1      0.637877
5                            get angri want damn pho      0      0.299172
6                                honeslti tast fresh      0      0.702198
762
238


In [4]:
#P(y=Ci|X=x)=P(y=Ci)*P(X=x|y=Ci)
po=len(trainSet[trainSet['Liked']==1])
against=len(trainSet[trainSet['Liked']==0])
total=po+against
print(po,against)
prior=np.array([against/total,po/total])
print(prior)

384 378
[0.49606299 0.50393701]


In [98]:
#P(y=Ci|X=x)=P(y=Ci)*P(X=x|y=Ci)
#P(y=C|X=x)=P(y=C)*P(x1,x2..xn|y=C)
#1xn=1x2x2xn
def row2vec(row,wordBag):
    vec=[]
    words=set()
    for word in row.split(' '):
        if word not in words:
            words.add(word)
    for word in wordBag:
        if word not in words:
            vec.append(0)
        else:
            vec.append(1)
    return np.array(vec)

def getWordBag(trainSet):
    wordBag=set()
    for row in trainSet['Review']:
        words=set(row.split(' '))
        for word in words:
            if word not in wordBag and word!='':
                wordBag.add(word)
    return list(wordBag)

class classifier:
    def __init__(self,trainSet):
#         self.conditionMatrix=self.getMatrix()
        self.po=len(trainSet[trainSet['Liked']==1])
        self.agianst=len(trainSet[trainSet['Liked']==0])
        self.proWordFrequent={}
        self.againstWordFrequent={}
        self.wordBag=getWordBag(trainSet)
        self.prior=[]
        self.conditonMatrix=[]

    
    def getPrior(self):
        po=self.po
        against=self.agianst
        total=po+against
        print(po,against)
        prior=np.array([against/total,po/total])
        print(prior)
        return prior
    
    def train(self):
        self.prior=self.getPrior()
        wb=self.wordBag
        for row in trainSet[trainSet['Liked']==1]['Review']:
            row=row.split(' ')
            for word in row:
                if word not in self.proWordFrequent:
                    self.proWordFrequent[word]=1/self.po
                else:
                    self.proWordFrequent[word]+=1/self.po
        for row in trainSet[trainSet['Liked']==0]['Review']:
                row=row.split(' ')
                #idx 
                for word in row:
                    if word not in self.againstWordFrequent:
                        self.againstWordFrequent[word]=1/self.agianst
                    else:
                        self.againstWordFrequent[word]+=1/self.agianst
#         print(self.againstWordFrequent)
#         print(self.proWordFrequent)
        def dic2Vec(dic,wb):
            result=[]
            for word in wb:
                if word not in dic.keys():
                    result.append(0)
                else:
                    result.append(dic[word])
            return np.array(result)
        n1=dic2Vec(self.proWordFrequent,wb)
        n2=dic2Vec(self.againstWordFrequent,wb)
        self.conditonMatrix=np.matrix(np.array([n1,n2]))
        print(self.conditonMatrix.shape)
        #[n1,n2]

    
    def predict(self,testRow):
        test=np.transpose(row2vec(testRow,self.wordBag))
        #2xnxnx1
        proProb=1
        againstProb=1
        result=np.dot(self.conditonMatrix,test)
        if np.argmax(result)==0:
            return 1
        else:
            return 0

In [100]:
tmp= classifier(trainSet)
tmp.train()
cnt=0
for i in range (len(testSet['Liked'].index)):
    if tmp.predict(testSet['Review'].iloc[i])==testSet['Liked'].iloc[i]:
        cnt+=1
#     else:
#         print(testSet['Review'].iloc[i],testSet['Liked'].iloc[i],tmp.predict(testSet['Review'].iloc[i]))
print(cnt/len(testSet.index))

384 378
[0.49606299 0.50393701]
(2, 1327)
0.7478991596638656
