In [None]:
## Imports all necessary modules
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from numpy import linalg
from scipy import stats
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
from sklearn.naive_bayes import MultinomialNB
from sklearn import preprocessing

%matplotlib inline



In [None]:
## Read the part-1 Yelp usefulness dataset from disk (comma-separated, first row is the header)
yelp_data = pd.read_csv("./data/Yelp_Usefulness_Assignment2_1.csv", sep=",", header=0)

## Preview the first 20 tuples
print(yelp_data.head(20))

## Report the dimensions of the data as (rows, columns)
print(yelp_data.shape)

FileNotFoundError: ignored

In [None]:
## Descriptive analysis: lift the column display limit so no attribute is truncated,
## then print the summary statistics produced by describe()
pd.options.display.max_columns = None

print(yelp_data.describe())

In [None]:
## Describe the identifier and label attributes on their own
print(yelp_data.loc[:, ["review_id", "class"]].describe())

### 1
###### 1.1
###### 1.1.1

The first problem is missing values. We can use the isna() function to detect missing values in each attribute.

In [None]:
## Count the missing values in each attribute
print(yelp_data.isnull().sum())

The report shows there are 5 missing values in the "eigenvector" attribute. Next, we use the .isna() function again to check for missing values across the tuples of that attribute.

In [None]:
## Flag, tuple by tuple, whether the "eigenvector" attribute is missing
print(pd.isna(yelp_data["eigenvector"]))

###### 1.1.2

In [None]:
## Box plot of the non-missing "eigenvector" values to inspect their distribution
fig = plt.figure(figsize=(6, 6))

## dropna() removes the missing tuples, which boxplot cannot handle
plt.boxplot(yelp_data["eigenvector"].dropna(), labels=["Eigenvector"])

plt.tick_params(labelsize=15)

plt.title("The normalized number of words related to 'eigenvector' sentiment", fontsize=15)

Most values appear to be non-zero and the distribution is negatively skewed, so the median, which is robust to skew, is a suitable measure of central tendency for filling in the missing values.

In [None]:
## Median of the observed values; Series.median() skips missing tuples by default
median = yelp_data["eigenvector"].median()
print ("The median is: ", median)

## Replace the missing values with the median.
## The filled column is assigned back explicitly: calling fillna(..., inplace=True) on a
## column selected from the frame is a chained in-place update, which raises a
## FutureWarning in pandas 2.x and no longer modifies the frame under Copy-on-Write.
yelp_data["eigenvector"] = yelp_data["eigenvector"].fillna(median)

## Check that no attribute has missing values left
print(yelp_data.isna().sum())

###### 1.1.3

The pro of this approach is that no data is discarded and the imputed value has a good chance of being equal or close to the actual value. The con is that choosing the median does not necessarily give us an advantage if the missing values are evenly (uniformly) distributed.

###### 1.2
###### 1.2.1

The second problem is outliers. We can use the Z-score for outlier detection.

In [None]:
# Use the Z-score to detect outliers.
# Column positions 1-12 and 14-24 are scored; positions 0 and 13 are skipped
# (presumably the non-numeric review_id and class columns -- verify against the CSV header).
positions = [*range(1, 13), *range(14, 25)]

# Absolute z-score of every selected cell, computed column by column
z = np.abs(stats.zscore(yelp_data.iloc[:, positions]))
print(z)

In [None]:
## Cut-off on the absolute z-score above which a cell is treated as an outlier
threshold = 3

## np.where returns two arrays: the row positions and the column positions (within `z`)
## of every cell whose absolute z-score exceeds the cut-off
print(np.where(z > threshold))

In [None]:
## Count, for each flagged row, how many attributes have |z| > 3.
## The printed array has the row positions in its first row and the counts in its second.
unique_elements, counts_elements = np.unique(
    np.where(z > 3)[0],
    return_counts=True,
)
print(np.vstack((unique_elements, counts_elements)))

Rows 69 and 262 each have 6 attributes with |z| > 3, the largest count of any row, so they are treated as the outliers.

###### 1.2.2

We can remove outliers from the dataset by using the drop() function.

In [None]:
## Remove the two tuples with the largest number of outlying attributes (rows 69 and 262).
## DataFrame.drop returns a new frame rather than modifying the original, so the result
## must be assigned back; errors="ignore" keeps the cell safe to re-run once the rows are gone.
yelp_data = yelp_data.drop(index=[69, 262], errors="ignore")

## Show the dimensions after removal; the row count should be two lower than before.
print(yelp_data.shape)

###### 1.2.3

The pro of this approach is that it may eliminate data errors and improve data quality. The con is that, if the outliers are legitimate observations, removing them may distort the results and lead to wrong outcomes.

###### 1.3
###### 1.3.1

The third problem is redundancies. We can use correlation analysis to check redundancies.

In [None]:
## Pairwise Pearson correlation between the numeric attributes.
## Non-numeric columns are excluded explicitly: DataFrame.corr stopped dropping them
## silently in pandas 2.0 and raises on string columns instead.
pcorr = yelp_data.select_dtypes(include="number").corr(method="pearson")

## Show every column of the correlation matrix instead of a truncated view
pd.set_option('display.max_columns', None)
pcorr

In [None]:
## Annotated heatmap of the correlation matrix; darker red means stronger positive correlation
plt.figure(figsize=(12, 10))
sns.heatmap(
    pcorr,
    annot=True,
    cmap=plt.cm.Reds,
)
plt.show()

###### 1.3.2

We can drop one attribute from the most strongly correlated pair, degree and eigenvector, whose correlation is 0.94.

In [None]:
## Drop the redundant "eigenvector" attribute (highly correlated with "degree").
## The column axis is named explicitly because the positional `axis` argument was removed
## in pandas 2.0, and the result is assigned back because drop returns a new frame.
## errors="ignore" keeps the cell safe to re-run once the column is gone.
yelp_data = yelp_data.drop(columns=["eigenvector"], errors="ignore")

## Display the frame without the dropped attribute
yelp_data

###### 1.3.3

The pro of this approach is that it reduces redundancy in the data. The con is that it discards an attribute that may have a significant impact on later observation and processing.

### 2

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

## Read the part-2 Yelp usefulness dataset from disk (comma-separated, first row is the header).
## This rebinds yelp_data, replacing the part-1 frame used in the cells above.
yelp_data = pd.read_csv("./data/Yelp_Usefulness_Assignment2_2.csv", sep=",", header=0)

## Preview the first 20 tuples
print(yelp_data.head(20))

In [None]:
## Create the feature matrix by dropping the review_id and label attributes.
## review_id is not going to be helpful for predicting the usefulness of reviews.
## The column axis is named explicitly; the positional `axis` argument was removed in pandas 2.0.
X = yelp_data.drop(columns=["review_id", "class"])

## Pre-processing. Sklearn takes integers as labels: useful -> 1, not_useful -> 0.
## The target is built from the "class" column only. Assigning through a boolean row mask
## (yelp_data[mask] = 1) would overwrite every column of the matching rows, corrupting
## yelp_data and making the features wrong if this cell is run a second time.
## Any label other than the two expected ones maps to NaN and makes astype raise.
y = yelp_data["class"].map({"useful": 1, "not_useful": 0}).astype('int')

## Create a model and fit it on the full data set
clf = LogisticRegression()
clf.fit(X, y)

## Predict the target class with the trained model (same data it was trained on)
predictions = clf.predict(X)

## Calculate the training accuracy; accuracy_score expects (y_true, y_pred)
accuracy = accuracy_score(y, predictions)

print(accuracy)

###### 2.1

The best accuracy I got is 0.762.

In [None]:
## import the necessary libraries
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

## Standardise every feature (z-transformation) and keep the original column names
z_scaler = preprocessing.StandardScaler()
X_scaled = pd.DataFrame(z_scaler.fit_transform(X), columns=X.columns)

## Sequential floating forward selection: consider subset sizes from 1 up to all
## features, scoring candidates by accuracy with cross-validation disabled (cv=0)
sfs = SFS(
    LogisticRegression(),
    k_features=(1, X_scaled.shape[1]),
    forward=True,
    floating=True,
    scoring='accuracy',
    cv=0,
).fit(X_scaled, y)

## Report the final set of selected features
print(sfs.k_feature_names_)

## Keep only the selected columns of the scaled feature matrix
X_selected = sfs.transform(X_scaled)

## Refit the classifier from the previous cell on the selected subset and
## predict on the same data it was fitted on (there is no held-out test set here)
clf.fit(X_selected, y)
predictions = clf.predict(X_selected)

## Accuracy of the refitted classifier on that data
accuracy = accuracy_score(predictions, y)

print(accuracy)

###### 2.2

I have applied z-transformation and Min_Max Scaler with different classifier options: k_features=(1,X_scaled.shape[1])/2/3, forward=True/False, floating=True/False, scoring='accuracy'/'neg_mean_squared_error', cv=0/5/10.\
The overall accuracy with Min_Max Scaler is lower than with z-transformation, no matter how I change the arguments. With Min_Max Scaler the accuracy is always lower than the baseline classifier's 0.752, even when cv is set to 0.\
The cv argument sets the cross-validation scheme. When cv increases, the accuracy is lower and execution takes longer. When cv is 0, the other arguments no longer affect the accuracy.\
With z-transformation, changing forward from False to True decreases the accuracy.\
With z-transformation, changing floating from False to True increases the accuracy.\
With Min_Max Scaler, changing forward from False to True increases the accuracy.\
With Min_Max Scaler, changing floating from False to True increases the accuracy.

### 3
###### 3.1
###### 3.1.1

Frequent 1-itemsets

|Itemset|Count|Total # of Transactions|Support|Passing Minimum Support|
|------|------|------|------|------|
|Computer|3|6|3/6=0.5|Yes|
|Mouse|2|6|2/6=0.33|No|
|Smart Watch|4|6|4/6=0.67|Yes|
|Tablet|2|6|2/6=0.33|No|
|Smart Phone|3|6|3/6=0.5|Yes|
|Game Console|2|6|2/6=0.33|No|

Frequent 2-itemsets

|Itemset|Count|Total # of Transactions|Support|Passing Minimum Support|
|------|------|------|------|------|
|Computer, Smart Watch|1|6|1/6=0.17|No|
|Computer, Smart Phone|1|6|1/6=0.17|No|
|Smart Watch, Smart Phone|3|6|3/6=0.5|Yes|

Frequent 3-itemsets

|Itemset|Count|Total # of Transactions|Support|Passing Minimum Support|
|------|------|------|------|------|
|Computer, Smart Watch, Smart Phone|1|6|1/6=0.17|No|

###### 3.1.2

|Rule|support (A⇒B)|confidence (A⇒B)|Passing Minimum Support|Passing Minimum Confidence|
|------|------|------|------|------|
|Smart Watch ⇒ Smart Phone|3/6=0.5|3/4=0.75|Yes|Yes|
|Smart Phone ⇒ Smart Watch|3/6=0.5|3/3=1|Yes|Yes|

###### 3.2
###### 3.2.1

Expected Table

||Smart Watch|Not Smart Watch|
|---|---|---|
|Smart Phone|500(850*0.6=510)|350(850*0.4=340)|
|Not Smart Phone|100(150*0.6=90)|50(150*0.4=60)|

|Rule|support (A⇒B)|confidence (A⇒B)|Lift (A⇒B)|chi-squared ( χ2 )|
|------|------|------|------|------|
|Smart Watch ⇒ Smart Phone|500/1000=0.5|500/600=0.83|(500/1000)/((850/1000)*(600/1000))=0.98|(500-510)^2/510+(350-340)^2/340+(100-90)^2/90+(50-60)^2/60=3.27|
|Not Smart Watch ⇒ Smart Phone|350/1000=0.35|350/400=0.875|(350/1000)/((400/1000)*(850/1000))=1.03|(500-510)^2/510+(350-340)^2/340+(100-90)^2/90+(50-60)^2/60=3.27|

###### 3.2.2

Both rules have support and confidence of at least 0.35, so they pass typical minimum thresholds, but support and confidence alone cannot tell us whether the items are correlated. The lift of Smart Watch ⇒ Smart Phone is 0.98 (slightly below 1) and the lift of Not Smart Watch ⇒ Smart Phone is 1.03 (slightly above 1); both are so close to 1 that they indicate the items are almost independent. Chi-squared is computed over all four cells of the contingency table, with the expected counts as denominators, so it is the same for both rules (about 3.27). Because this is below the critical value of 3.84 (1 degree of freedom, significance level 0.05), we cannot reject the hypothesis that Smart Watch and Smart Phone are independent. The direction of the weak deviation can be read from the observed versus expected counts (500 observed vs. 510 expected for Smart Phone with Smart Watch; 350 observed vs. 340 expected for Smart Phone without Smart Watch), but it is not statistically significant. Lift is easy to calculate but does not say whether a deviation from 1 is significant; chi-squared is harder to calculate but provides a significance test. In the future, I will still make a table like this to examine the rules because lift and chi-squared can cross-validate the conclusion.

### 4
###### 4.1

In [1]:
## Silence pandas' chained-assignment warning (the default setting is 'warn')
pd.set_option("mode.chained_assignment", None)

## Read the part-2 Yelp usefulness dataset from disk again (comma-separated, header in row 0)
yelp_data = pd.read_csv("./data/Yelp_Usefulness_Assignment2_2.csv", sep=",", header=0)

## Preview the first 20 tuples and report the (rows, columns) dimensions
print(yelp_data.head(20))
print(yelp_data.shape)

NameError: ignored

In [None]:
## Drop the identifier; the column axis is named explicitly because the positional
## `axis` argument was removed in pandas 2.0.
yelp_data = yelp_data.drop(columns=["review_id"])

## Encode the label as integers (useful -> 1, not_useful -> 0) and store the result.
## Chained assignment (yelp_data['class'][mask] = ...) is not guaranteed to write to the
## frame, and a bare astype() call returns a new Series without changing the column,
## so the converted column is assigned back in a single step.
## Any label other than the two expected ones maps to NaN and makes astype raise.
yelp_data["class"] = yelp_data["class"].map({"useful": 1, "not_useful": 0}).astype('int')

## Display the encoded label column
yelp_data["class"]

In [None]:
## Binarise every feature: 1 when the value is at or above the column mean, otherwise 0.
## The feature columns are taken from yelp_data itself (everything except the label)
## rather than from the X matrix of an earlier section, so the cell does not rely on
## state left behind by other cells.
feature_columns = yelp_data.columns.drop("class")

## Comparing the frame with its column means aligns on column names, so each column is
## compared with its own mean; the boolean result is stored as integers.
yelp_data[feature_columns] = (
    yelp_data[feature_columns] >= yelp_data[feature_columns].mean()
).astype('int')

In [None]:
## Preview the first 20 transformed tuples, then the (rows, columns) dimensions
print(yelp_data.head(20), yelp_data.shape, sep="\n")

###### 4.2

In [None]:
from mlxtend.frequent_patterns import apriori, association_rules

## Mine the itemsets that occur in at least 30% of the tuples, labelled by column name
frequent_itemsets = apriori(
    yelp_data,
    min_support=0.3,
    use_colnames=True,
)

## Preview the first five frequent itemsets
print(frequent_itemsets.head(5))

In [None]:
## Derive association rules from the frequent itemsets, keeping those with lift >= 1
rules = association_rules(
    frequent_itemsets,
    metric="lift",
    min_threshold=1,
)
print(rules)

In [None]:
## Keep the rules with lift strictly above 1 and confidence of at least 0.3
rules.loc[rules["lift"].gt(1) & rules["confidence"].ge(0.3)]

###### 4.3

Since a lift equal to 1 implies that the antecedent and consequent are independent, and keeping those rules produces too many lines, I use rules['lift'] > 1 as the filter. There are 57 rules that are positively correlated.
One interesting rule is (correct_spell_ratio) ⇒ (FleschReadingEase): it makes sense logically, and its lift is larger than 1, which means the two are positively correlated.
One uninteresting rule is (dislike, review_stars) ⇒ (correct_spell_ratio): logically, (correct_spell_ratio) should be the antecedent of (dislike, review_stars) rather than its consequent, yet the lift shows they are positively correlated, although it is very close to 1.
From this exercise, I learned how to find association rules in a dataset. The resulting rules make sense: the antecedents and consequents are positively correlated from both a logical and a mathematical perspective.

    Q3.1.2 
    
 
    
 
    
 
     Incorrect support scores; please compare them to those you have in Q3.1.2 and Q3.2. Better to list your calculation steps. (-0.1) 
    
 
    
 
    
 
     # 3.2.1 
    
 
    
 
    
 
     Chi-squared scores for both cases should be the same (around 3.27). (-0.1) 
    
 
    
 
    
 
     # 3.2.2 
    
 
    
 
    
 
     Both Chi-squared (less than critical value 3.84) and Lift (around 1) scores indicate that the two are rather independent. (-0.2) 