In [1]:
# Load the library
import csv
import pandas as pd
import numpy as np
import string
import nltk
from nltk.corpus import stopwords
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.preprocessing import normalize
import seaborn as sns
from nltk.tokenize import word_tokenize
from scipy.spatial import distance
from sklearn.metrics import accuracy_score
import re 
import glob

In [2]:
def tokenize(text):
    # tokenization
    tokens = []
    text = str(text).lower()

    pattern= r'[a-z]+[.''\'-_]*[a-z]+'
    tokensall = nltk.regexp_tokenize(text, pattern)

    stop_words = stopwords.words('english')
    
    for token in tokensall:  
        if token not in stop_words:
            tokens.append(token)  

    return tokens

def sentiment_analysis(text, positive_words, negative_words):
    # count the number of positive words and negative words
    negations=["not", "no", "isn't", "wasn't", "aren't", \
               "weren't", "don't", "didn't", "cannot", \
               "couldn't", "won't", "neither", "nor"]
    
    sentiment = None
    
    tokens=tokenize(text)
    positive_tokens=[]
    negative_tokens=[]
    
    positive_words=[token for token in tokens if token in positive_words]
    negative_words=[token for token in tokens if token in negative_words]
    
    for idx, token in enumerate(positive_words):
        if token in positive_words:
            if idx!=0:
                if tokens[idx-1] not in negative_words:
                    positive_tokens.append(token)
                if tokens[idx-1]  in negative_words:
                    negative_tokens.append(token)
            else:
                positive_tokens.append(token)
                
    for idx, token in enumerate(negative_words):
        if token in negative_words:
            if idx!=0:
                if tokens[idx-1] not in negative_words:
                    negative_tokens.append(token)
                if tokens[idx-1]  in negative_words:
                    positive_tokens.append(token)
            else:
                negative_tokens.append(token)
    
    # compare the number and determine the label
    if len(positive_tokens)>len(negative_tokens):
        sentiment = 1
    if len(positive_tokens)<=len(negative_tokens):
        sentiment = 0
    
    return sentiment


def performance_evaluate(input_file, positive_words, negative_words):
    # test the accuracy of sentiment analysis model
    accuracy = None
    data=input_file
    label=data['sentiment'].tolist()
    review=data['review'].tolist()
    corr_data=list(zip(label,review))
    
    correct_pre = 0

    for label, review in corr_data:
        pre_sentiment=sentiment_analysis(review, positive_words, negative_words)
        if pre_sentiment == label:
            correct_pre+=1
        
    accuracy=correct_pre/len(corr_data)
    
    return accuracy



In [3]:
if __name__ == "__main__":  
    
    # cluster0
    path =r'C:\Users\liulu\OneDrive\Documents\19fall\BIA660\FinalPJ\Reviews\Reviews\Cluster0' # use your path
    allFiles = glob.glob(path + "/*.csv")
    frame = pd.DataFrame()
    list_ = []
    for file_ in allFiles:
        df = pd.read_csv(file_,index_col=None, header=0)
        list_.append(df)
    cluster0 = pd.concat(list_)

    with open("positive-words.txt",'r') as f:
        positive_words=[line.strip() for line in f]
        
    with open("negative-words.txt",'r') as f:
        negative_words=[line.strip() for line in f]
        
    acc=performance_evaluate(cluster0, \
                                  positive_words, negative_words)
    print("\nCluster0 sentiment accuracy: {0:.2f}".format(acc))

    # cluster1
    path =r'C:\Users\liulu\OneDrive\Documents\19fall\BIA660\FinalPJ\Reviews\Reviews\Cluster1' # use your path
    allFiles = glob.glob(path + "/*.csv")
    frame = pd.DataFrame()
    list_ = []
    for file_ in allFiles:
        df = pd.read_csv(file_,index_col=None, header=0)
        list_.append(df)
    cluster1 = pd.concat(list_)

    with open("positive-words.txt",'r') as f:
        positive_words=[line.strip() for line in f]
        
    with open("negative-words.txt",'r') as f:
        negative_words=[line.strip() for line in f]
        
    acc=performance_evaluate(cluster1, \
                                  positive_words, negative_words)
    print("\nCluster1 sentiment accuracy: {0:.2f}".format(acc))    
    
    # cluster2
    path =r'C:\Users\liulu\OneDrive\Documents\19fall\BIA660\FinalPJ\Reviews\Reviews\Cluster2' # use your path
    allFiles = glob.glob(path + "/*.csv")
    frame = pd.DataFrame()
    list_ = []
    for file_ in allFiles:
        df = pd.read_csv(file_,index_col=None, header=0)
        list_.append(df)
    cluster2 = pd.concat(list_)

    with open("positive-words.txt",'r') as f:
        positive_words=[line.strip() for line in f]
        
    with open("negative-words.txt",'r') as f:
        negative_words=[line.strip() for line in f]
        
    acc=performance_evaluate(cluster2, \
                                  positive_words, negative_words)
    print("\nCluster2 sentiment accuracy: {0:.2f}".format(acc))    
   


Cluster0 sentiment accuracy: 0.70

Cluster1 sentiment accuracy: 0.81

Cluster2 sentiment accuracy: 0.65


In [4]:
if __name__ == "__main__":  
    
    # the whole dataset
    df = pd.read_csv(r'C:\Users\liulu\OneDrive\Documents\19fall\BIA660\FinalPJ\review_join_cluster.csv',index_col=None, header=0)

    with open("positive-words.txt",'r') as f:
        positive_words=[line.strip() for line in f]
        
    with open("negative-words.txt",'r') as f:
        negative_words=[line.strip() for line in f]
        
    acc=performance_evaluate(df, \
                                  positive_words, negative_words)
    print("\n sentiment accuracy: {0:.2f}".format(acc))
    


 sentiment accuracy: 0.69
