## data cleaning

In [None]:
import re
import string
import warnings

# Silence warnings before the third-party imports so theirs are hidden too.
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd 
import seaborn as sns

In [None]:
# Read in the raw review data and print a summary of its columns.
# NOTE(review): the path is machine-specific; adjust it when running elsewhere.
data = pd.read_csv("/Users/victoriaguo/Desktop/DS 4002/project 1/original_data.csv")
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
data.info()

In [None]:
# Count missing values per column. The result is printed explicitly because a
# bare expression that is not the last statement of a cell is never displayed.
print(data.isnull().sum())

# Fill missing review text with the placeholder "Missing" instead of leaving
# it empty.
data['reviewText'] = data['reviewText'].fillna('Missing')

In [None]:
# Combine the review text and summary columns into a single "reviews" column.
# A space separator keeps the last word of the review text from fusing with
# the first word of the summary, and a missing summary is treated as empty so
# it cannot turn the whole combined value into NaN.
data['reviews'] = (data['reviewText'] + ' ' + data['summary'].fillna('')).str.strip()
data = data.drop(['reviewText', 'summary'], axis=1)

In [None]:
# create sentiment column
# Show how many reviews received each star rating. Printed explicitly because
# a bare expression followed by more statements is not displayed.
print(data['overall'].value_counts())

def f(row):
    """Map a row's 'overall' star rating to a sentiment label.

    Returns 'Neutral' for a rating of 3.0, 'Negative' for 1.0 or 2.0,
    'Positive' for 4.0 or 5.0, and the integer -1 for any other value.
    """
    rating = row['overall']
    if rating in (4.0, 5.0):
        return 'Positive'
    if rating in (1.0, 2.0):
        return 'Negative'
    if rating == 3.0:
        return 'Neutral'
    # Anything that is not a whole rating from 1 to 5 is flagged with -1.
    return -1

# Label every review: axis=1 passes each row to f, and the returned label is
# stored in the new 'sentiment' column.
data['sentiment'] = data.apply(f, axis=1)

In [None]:
# View the data with the added sentiment column. head() is printed explicitly
# because only the last expression of a cell is displayed automatically.
print(data.head())

# Number of reviews per sentiment label.
data['sentiment'].value_counts()

In [None]:
# Drop the asin, reviewerID, reviewerName and unixReviewTime columns.
unused_columns = ['asin', 'reviewerID', 'reviewerName', 'unixReviewTime']
data = data.drop(columns=unused_columns)

# Summarise the remaining columns.
data.info()

In [None]:
# Split reviewTime at its first comma into a "date" part and a "year" part.
# NOTE(review): assumes values look like "MM DD, YYYY" — confirm against the data.
date_new = data["reviewTime"].str.split(",", n=1, expand=True)

data["date"] = date_new[0].str.strip()
# The text after the comma keeps its leading space (", 2014" -> " 2014"), so
# it is stripped to leave a clean year value. The year used to be assigned
# twice; one assignment is enough.
data["year"] = date_new[1].str.strip()

data = data.drop(['reviewTime'], axis=1)

In [None]:
# Preview the first rows of the dataset after the cleaning steps so far.
data.head()

In [None]:
# Create a new column with the helpfulness rate of a review.
# Each "helpful" value is split at its first comma; the left piece is then
# split at "[" and the right piece at "]".
# NOTE(review): assumes values look like "[a, b]" — confirm against the data.
halves = data["helpful"].str.split(",", n=1, expand=True)

# Both pieces get a fresh 0..n-1 index.
left = halves[0].str.split("[", n=1, expand=True).reset_index(drop=True)
right = halves[1].str.split("]", n=1, expand=True).reset_index(drop=True)

# Discard the text before "[" (column 0 of the left split) and the text after
# "]" (column 1 of the right split), then put the two remaining columns side
# by side: helpful[1] holds the first number, helpful[0] the second.
helpful = pd.concat([left.drop(columns=[0]), right.drop(columns=[1])], axis=1)

def trim_all_columns(df):
    """
    Trim whitespace from ends of each string value across all columns of df.

    Non-string values are passed through unchanged. A new DataFrame is
    returned; ``df`` itself is not modified.
    """
    def trim_strings(value):
        return value.strip() if isinstance(value, str) else value

    # DataFrame.applymap is deprecated in recent pandas releases. Mapping each
    # column with Series.map is the same element-wise operation and works on
    # both old and new versions.
    return df.apply(lambda column: column.map(trim_strings))

# Strip stray whitespace left over from splitting.
helpful = trim_all_columns(helpful)

# Convert both vote counts from text to integers.
helpful[0] = helpful[0].astype(str).astype(int)
helpful[1] = helpful[1].astype(str).astype(int)

# Helpfulness rate = helpful[1] / helpful[0]. pandas does not raise
# ZeroDivisionError for Series division (0/0 gives NaN, x/0 gives inf), so the
# former try/except could never trigger. Undefined ratios are set to 0 here.
ratio = helpful[1] / helpful[0]
ratio = ratio.replace([np.inf, -np.inf], np.nan).fillna(0)

# Round the results to two decimal places.
helpful['result'] = ratio.round(2)

# Attach the results to a new column of the main dataframe. Values are
# assigned by position because `helpful` carries a fresh 0..n-1 index while
# `data` keeps its own; both have one entry per row of `data`.
data['helpful_rate'] = helpful['result'].values

# Drop the original helpful column from the main dataframe.
data = data.drop(['helpful'], axis=1)

In [None]:
# Keep only the year: the date column is no longer needed.
data = data.drop(columns=['date'])

In [None]:
# Preview the first rows of the dataset in its current state.
data.head()

In [None]:
# clean the review column 
def review_cleaning(text):
    '''Lowercase the text and strip out noise before analysis.

    Removes, in order: text in square brackets, links, angle-bracket tags,
    punctuation, newlines, and words containing digits. The input is passed
    through ``str()`` first, so non-string values are cleaned in their string
    form.

    Returns the cleaned string.
    '''
    text = str(text).lower()
    # Patterns are raw strings so that regex escapes such as \[ and \S are not
    # read as (invalid) Python string escapes.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [None]:
# Clean every entry of the reviews column with review_cleaning.
cleaned_reviews = data['reviews'].apply(review_cleaning)
data['reviews'] = cleaned_reviews

data.head()

In [None]:
# Drop the neutral reviews (only positive and negative reviews are compared).
neutral_rows = data.index[data['sentiment'] == 'Neutral']
data.drop(neutral_rows, inplace=True)

In [None]:
# Export the cleaned data to CSV. index=False keeps the row index out of the
# file; otherwise reading the file back adds a spurious "Unnamed: 0" column.
data.to_csv("/Users/victoriaguo/Desktop/DS 4002/project 1/final_data.csv", index=False)

## EDA

In [None]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

import matplotlib.pyplot as plt 
from matplotlib import rcParams
import seaborn as sns
from textblob import TextBlob
from plotly import tools
import plotly.graph_objs as go
from plotly.offline import iplot

In [None]:
# Load the cleaned dataset from final_data.csv.
# NOTE(review): the path is machine-specific; adjust it when running elsewhere.
data = pd.read_csv("/Users/victoriaguo/Desktop/DS 4002/project 1/final_data.csv")

In [None]:
# Table of the mean helpful rate for each sentiment group.
mean_rate_by_sentiment = data.groupby('sentiment')['helpful_rate'].mean()
mean_rate_by_sentiment.to_frame()

In [None]:
# Box plot of the helpful rate for each sentiment category.
ax = sns.boxplot(x=data["sentiment"], y=data["helpful_rate"])
ax.set_title('Sentiment vs Helpfulness')
ax.set_xlabel('Sentiment categories')
ax.set_ylabel('Helpful Rate')
plt.show()

In [None]:
# Violin plot of the same data, because the box plot is hard to read.
ax = sns.violinplot(x=data["sentiment"], y=data["helpful_rate"])
ax.set_title('Sentiment vs Helpfulness')
ax.set_xlabel('Sentiment categories')
ax.set_ylabel('Helpful Rate')
plt.show()

In [None]:
# Keep only the rows whose helpful rate is not 0.
has_nonzero_rate = data['helpful_rate'].ne(0.00)
data = data[has_nonzero_rate]

In [None]:
# Violin plot again, now that the zero-rate observations have been removed.
ax = sns.violinplot(x=data["sentiment"], y=data["helpful_rate"])
ax.set_title('Sentiment vs Helpfulness')
ax.set_xlabel('Sentiment categories')
ax.set_ylabel('Helpful Rate')
plt.show()

In [None]:
# Mean helpful rate per sentiment group, with the zero rates removed.
mean_rate_by_sentiment = data.groupby('sentiment')['helpful_rate'].mean()
mean_rate_by_sentiment.to_frame()

In [None]:
# Pie chart of how the reviews are distributed across sentiment labels.
category_counts = data['sentiment'].value_counts()
slice_sizes = category_counts.values
slice_labels = category_counts.index

plt.pie(slice_sizes, labels=slice_labels, autopct='%1.1f%%')
plt.title('Sentiment Pie Chart')
plt.show()

In [None]:
# Data cleaning for the bigram plot: remove common stop words from the reviews.
stop_words= ['yourselves', 'between', 'whom', 'itself', 'is', "she's", 'up', 'herself', 'here', 'your', 'each', 
             'we', 'he', 'my', "you've", 'having', 'in', 'both', 'for', 'themselves', 'are', 'them', 'other',
             'and', 'an', 'during', 'their', 'can', 'yourself', 'she', 'until', 'so', 'these', 'ours', 'above', 
             'what', 'while', 'have', 're', 'more', 'only', "needn't", 'when', 'just', 'that', 'were', "don't", 
             'very', 'should', 'any', 'y', 'isn', 'who',  'a', 'they', 'to', 'too', "should've", 'has', 'before',
             'into', 'yours', "it's", 'do', 'against', 'on',  'now', 'her', 've', 'd', 'by', 'am', 'from', 
             'about', 'further', "that'll", "you'd", 'you', 'as', 'how', 'been', 'the', 'or', 'doing', 'such',
             'his', 'himself', 'ourselves',  'was', 'through', 'out', 'below', 'own', 'myself', 'theirs', 
             'me', 'why', 'once',  'him', 'than', 'be', 'most', "you'll", 'same', 'some', 'with', 'few', 'it',
             'at', 'after', 'its', 'which', 'there','our', 'this', 'hers', 'being', 'did', 'of', 'had', 'under',
             'over','again', 'where', 'those', 'then', "you're", 'i', 'because', 'does', 'all']

# A set gives constant-time membership tests; the list above is scanned
# linearly for every single word otherwise.
stop_word_set = set(stop_words)

# A review that is missing (NaN) has no .split() and would raise; treat it as
# empty text. NOTE(review): pandas reads empty CSV fields as NaN by default,
# so fully-cleaned-away reviews presumably arrive here as NaN — confirm.
data['reviews'] = data['reviews'].fillna('').apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_word_set)
)

In [None]:
# Bigram plots: separate the positive and negative reviews, discarding any row
# that contains a missing value.
is_positive = data["sentiment"] == 'Positive'
is_negative = data["sentiment"] == 'Negative'
positive_reviews = data[is_positive].dropna()
negative_reviews = data[is_negative].dropna()

def generate_ngrams(text, n_gram=1):
    """Return the n-grams of ``text`` as a list of space-joined strings.

    The text is lowercased and split on single spaces; empty pieces (left by
    repeated spaces) are discarded. ``n_gram`` is the number of consecutive
    words in each n-gram.
    """
    words = list(filter(None, text.lower().split(" ")))
    # Zipping the word list against copies of itself shifted by
    # 0..n_gram-1 positions lines up every run of n_gram consecutive words.
    shifted = [words[offset:] for offset in range(n_gram)]
    return list(map(" ".join, zip(*shifted)))

def horizontal_bar_chart(data, color):
    """Build a horizontal plotly bar trace from a word-count table.

    ``data`` is indexed by "word" and "wordcount"; both are reversed before
    plotting. ``color`` is used as the marker colour. Returns the ``go.Bar``
    trace with its legend entry hidden.
    """
    words = data["word"].values[::-1]
    counts = data["wordcount"].values[::-1]
    return go.Bar(
        x=counts,
        y=words,
        orientation='h',
        showlegend=False,
        marker={'color': color},
    )

from collections import defaultdict


def _bigram_counts(reviews):
    """Count the bigrams in an iterable of review strings.

    Returns a DataFrame with columns "word" and "wordcount", ordered from the
    most frequent bigram to the least frequent.
    """
    freq_dict = defaultdict(int)
    for sent in reviews:
        for word in generate_ngrams(sent, 2):
            freq_dict[word] += 1
    # Sort ascending by count, then reverse, so the most frequent come first.
    ranked = sorted(freq_dict.items(), key=lambda x: x[1])[::-1]
    # Naming the columns in the constructor also works when there are no
    # bigrams at all; assigning .columns afterwards raises on an empty frame.
    return pd.DataFrame(ranked, columns=["word", "wordcount"])


## Get the bar chart from positive reviews ##
positive_bigrams = _bigram_counts(positive_reviews["reviews"])
trace_pos = horizontal_bar_chart(positive_bigrams.head(20), 'green')

## Get the bar chart from negative reviews ##
negative_bigrams = _bigram_counts(negative_reviews["reviews"])
trace_neg = horizontal_bar_chart(negative_bigrams.head(20), 'red')

In [None]:
# Draw the two bigram bar charts stacked vertically: positive in row 1,
# negative in row 2.
plot = tools.make_subplots(
    rows=2,
    cols=1,
    vertical_spacing=0.1,
    subplot_titles=["Positive Review Words", "Negative Review Words"],
)
for row_number, bar_trace in enumerate((trace_pos, trace_neg), start=1):
    plot.append_trace(bar_trace, row_number, 1)

plot['layout'].update(
    height=1200,
    width=900,
    paper_bgcolor='rgb(255, 255, 255)',
    title="Bigram Plots",
)

iplot(plot, filename='bigram')

## Analysis Code

In [None]:
import numpy as np
import pandas as pd 
from scipy.stats import ttest_ind

In [None]:
# Read in the cleaned dataset from final_data.csv for the statistical tests.
# NOTE(review): the path is machine-specific; adjust it when running elsewhere.
data = pd.read_csv("/Users/victoriaguo/Desktop/DS 4002/project 1/final_data.csv")

In [None]:
# Split the helpful rates into two groups: positive and negative reviews.
positive = data.loc[data['sentiment'] == 'Positive', 'helpful_rate']
negative = data.loc[data['sentiment'] == 'Negative', 'helpful_rate']

In [None]:
# Test for normality: Kolmogorov-Smirnov test.
from scipy.stats import kstest

# kstest(..., 'norm') on its own compares against the standard normal
# (mean 0, standard deviation 1). The sample mean and standard deviation are
# passed so the comparison is against a normal distribution fitted to the data.
# NOTE(review): with parameters estimated from the same sample, the KS p-value
# is only approximate.
rates = data['helpful_rate']
statistic, p_value = kstest(rates, 'norm', args=(rates.mean(), rates.std()))

print("Kolmogorov-Smirnov Test:")
print("Statistic:", statistic)
print("P-value:", p_value)

In [None]:
# Test for constant variance between the two groups: Bartlett's test.
from scipy.stats import bartlett

bartlett_result = bartlett(positive, negative)
statistic, p_value = bartlett_result
print("Bartlett's Test:")
for label, value in (("Statistic:", statistic), ("P-value:", p_value)):
    print(label, value)

In [None]:
# Independent two-sample t test on the positive and negative helpful rates.
# NOTE(review): ttest_ind is called with its default settings, which presumably
# assume equal variances — confirm this matches the Bartlett result.
t_test_result = ttest_ind(positive, negative)
t_statistic, p_value = t_test_result

for label, value in (("T-Statistic:", t_statistic), ("P-Value:", p_value)):
    print(label, value)

In [None]:
# Create a new dataframe with one column of helpful rates per sentiment.
# Each group gets a fresh 0..n-1 index and the two are concatenated side by
# side, so the shorter column is padded with NaN. Assigning the second column
# to a frame already indexed by the first would cut it off if it were longer.
positive_rates = (
    data.loc[data['sentiment'] == 'Positive', 'helpful_rate']
    .reset_index(drop=True)
    .rename('Positive')
)
negative_rates = (
    data.loc[data['sentiment'] == 'Negative', 'helpful_rate']
    .reset_index(drop=True)
    .rename('Negative')
)

new = pd.concat([positive_rates, negative_rates], axis=1)

In [None]:
# Remove 0 values. The two columns hold unrelated reviews that merely share a
# row position, so zeros are blanked out per column (set to NaN) instead of
# dropping whole rows, which also discarded the other group's value.
new = new.mask(new == 0.00)
# Rows left with no value in either column carry no information.
new = new.dropna(how='all')

new.head()

In [None]:
# Because the assumptions above were not met, a non-parametric test is used to
# validate the conclusions. Positive and negative reviews are two independent
# groups, so the Mann-Whitney U test is used; the Wilcoxon signed-rank test is
# meant for paired observations and these reviews are not paired.
from scipy.stats import mannwhitneyu

# Compare the non-zero helpful rates of the two groups, taken straight from
# `data` so that no values are lost to row-wise pairing.
nonzero = data[data['helpful_rate'] != 0.00]
positive_rates = nonzero.loc[nonzero['sentiment'] == 'Positive', 'helpful_rate'].dropna()
negative_rates = nonzero.loc[nonzero['sentiment'] == 'Negative', 'helpful_rate'].dropna()

statistic, p_value = mannwhitneyu(positive_rates, negative_rates, alternative='two-sided')
print("Mann-Whitney U Test:")
print("Statistic:", statistic)
print("P-value:", p_value)