## Import Required Libraries

In [None]:
# Import Required Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
import statsmodels as sm

# Widen the notebook display limits for columns and rows.
# Use the fully qualified option names: the short forms 'max_columns' and
# 'max_rows' are resolved by pattern matching and raise OptionError on pandas
# versions that also define 'styler.render.max_columns' / 'styler.render.max_rows'.
pd.set_option('display.max_columns',500)
pd.set_option('display.max_rows',10000000)


## Import the data

In [None]:
# Import the data in data variable
data=pd.read_csv('sample30.csv')
data.head(3)

In [None]:
# shape of the data
data.shape

In [None]:
# Null value check on each column
data.isnull().sum()

In [None]:
((data.isnull().sum()/data.shape[0])*100).round(2)

## Insights:
- Three features have more than 40 percent null values, so let's drop them.


In [None]:
data.drop(['reviews_didPurchase','reviews_userCity','reviews_userProvince'],axis=1,inplace=True)

In [None]:
((data.isnull().sum()/data.shape[0])*100).round(2)

## Insights:
- In all but one of the columns that still contain null values, the percentage of null values is less than 1 percent.

In [None]:
data.head(2)

In [None]:
data.shape

## Replace nan values

In [None]:
# Impute missing user names with the most frequent user name.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the column selection: an in-place edit of a
# selected column is not guaranteed to reach `data` (pandas Copy-on-Write),
# and the `np.NaN` alias no longer exists in NumPy 2.0.
most_common_username=data['reviews_username'].value_counts().index[0]
data['reviews_username']=data['reviews_username'].fillna(most_common_username)

In [None]:
# Impute the remaining categorical columns with their most frequent value.
# fillna + assignment is used instead of replace(np.NaN, ..., inplace=True) on
# a column selection, which may not write back to `data` under pandas
# Copy-on-Write and relies on the `np.NaN` alias removed in NumPy 2.0.
for col in ['manufacturer','reviews_date','reviews_title']:
    data[col]=data[col].fillna(data[col].value_counts().index[0])

In [None]:
## Categorical null values are replaced with the most frequently occurring value of that column/feature

In [None]:
data.head()

In [None]:
# Impute missing recommendation flags with the most frequent flag.
# Assigned back via fillna rather than replace(np.NaN, ..., inplace=True) on a
# column selection (not guaranteed to update `data` under pandas Copy-on-Write;
# `np.NaN` was removed in NumPy 2.0).
most_common_recommend=data['reviews_doRecommend'].value_counts().index[0]
data['reviews_doRecommend']=data['reviews_doRecommend'].fillna(most_common_recommend)

In [None]:
data['reviews_doRecommend'].value_counts()

In [None]:
((data.isnull().sum()/data.shape[0])*100).round(2)

In [None]:
data.isnull().sum()/data.shape[0]

In [None]:
data.head(2)

In [None]:
data.shape

In [None]:
data.info()

In [None]:
data[data['user_sentiment'].isnull()]

In [None]:
data.dropna(how='any',axis=0,inplace=True)

In [None]:
data.reset_index(drop=True,inplace=True)

In [None]:
data.info()

# id,brand,categories,manufacturer,reviewdate can be eliminated

## Lets understand each Feature

In [None]:
len(data['id'].unique())

In [None]:
len(data['brand'].unique())

In [None]:
# Report how many distinct values each column of `data` holds.
for column_name in data.columns:
    distinct_count=len(data[column_name].unique())
    print("Unique of %s is %d"%(column_name,distinct_count))

In [None]:
data.head(1)

In [None]:
# We use normalize=True to visualise the data as proportions (relative frequencies)
data['reviews_doRecommend'].value_counts(normalize=True).plot(kind='bar')
plt.show()

In [None]:
data['reviews_rating'].value_counts(normalize=True).plot(kind='bar')
plt.show()

In [None]:
data['user_sentiment'].value_counts().plot(kind='bar')
plt.show()

In [None]:
data['user_sentiment'].value_counts()

In [None]:
data['user_sentiment'].value_counts(normalize=True).plot(kind='bar')
plt.show()

## Let's create the data suitable for sentiment analysis, for which we can use reviews_text, reviews_title and user_sentiment

In [None]:
# Work on an independent copy of `data`: a full slice (data[:]) is not a real
# copy, so adding columns to it later raises SettingWithCopyWarning and does
# not cleanly separate the two frames.
data2=data.copy()
data2.head()

In [None]:
data2['reviews']=data2['reviews_title']+' '+data2['reviews_text']
data2.head()

In [None]:
data2['usersentiment']=data2['user_sentiment'].replace({'Positive':1,'Negative':0})
data2.head(3)

In [None]:
data2.info()

In [None]:
# Independent copy of data2 (all columns kept) used for text cleaning and
# modelling; .copy() avoids the SettingWithCopyWarning that a full slice
# (data2[:]) triggers when columns are added to it later.
data3=data2.copy()
data3.head()

In [None]:
data3['usersentiment'].value_counts(normalize=True)

## Data cleaning:
- punctuation removal
- stopword removal
- common words
- stemming and lemmatisation
- regex can be used to cover the majority of these steps
- Stemming and lemmatisation

## Convert text to tokens
- bow
- tf-idf
- word2vec
- glove embeddings
- count vectoriser

In [None]:
pd.set_option('max_colwidth',200)
data3['reviews'].head()

## Data Cleaning

In [None]:
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
import re

ps=PorterStemmer()
lm=WordNetLemmatizer()

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix,classification_report

cv=CountVectorizer()
tf=TfidfVectorizer(max_features=5000,ngram_range=(1,3))

## Remove the reviews having length greater than 150 words

In [None]:
data.head()

In [None]:
corpus=[] # cleaned review text, one entry per row of data3['reviews']

# Build the stopword set and compile the pattern once, outside the loop.
# Rebuilding set(stopwords.words('english')) for every single word made the
# cleaning loop needlessly slow.
stop_words=set(stopwords.words('english'))
non_alphanumeric=re.compile(r'[^a-zA-Z0-9]')

# Clean every review as shown below
for sent in data3['reviews']:

    # 1. Lower-case the review
    sent=sent.lower()

    # 2. Replace every character that is not a letter or a digit with a space
    sent=non_alphanumeric.sub(' ',sent)

    # 3. Tokenise on whitespace (split() also discards repeated spaces)
    words=sent.split()

    # 4. Drop English stopwords and lemmatise the remaining tokens
    words=[lm.lemmatize(word) for word in words if word not in stop_words]

    # 5. Store the cleaned review as one space-separated string
    corpus.append(' '.join(words))

In [None]:
corpus[:10]

In [None]:
data3['reviews_cleaned']=corpus
data3.head()

In [None]:
data3.info()

In [None]:
# Number of words in every cleaned review, in row order.
l=[len(review.split()) for review in data3['reviews_cleaned']]

In [None]:
np.quantile(l,[0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.95,0.99,0.995,0.997,0.999,1])

In [None]:
data3.info()

In [None]:
data3.index

In [None]:
for i in data3['reviews_cleaned']:
    print(i)
    print('---')

In [None]:
counter=len(data3['reviews_cleaned'])
counter

In [None]:
for i,j in zip(range(counter+1),data3['reviews_cleaned']):
    print(i)
    print(j)
    print(data3['reviews_cleaned'][i])

In [None]:
indexes=[]

In [None]:
# Collect the index labels of the rows whose cleaned review is longer than
# 150 words, so they can be dropped in a later cell.
# The word counts are computed for the whole column at once and the labels are
# read from data3.index, so the result is what data3.drop() expects even if the
# index is not a plain 0..n-1 range. (The previous loop used range(counter+1)
# positions as labels and re-looked-up every row only to compare it with itself.)
max_words=150
review_lengths=data3['reviews_cleaned'].str.split().str.len()
indexes=data3.index[review_lengths>max_words].tolist()

In [None]:
print(indexes)

In [None]:
data3.info()

In [None]:
data3.drop(indexes,inplace=True)

In [None]:
# Renumber the rows after dropping the long reviews.
# drop=True discards the old labels instead of inserting them as a stray
# 'index' column, matching the earlier reset of `data`.
data3.reset_index(drop=True,inplace=True)

In [None]:
data3.info()

## We have removed reviews where the word count is greater than 150

In [None]:
x=data3['reviews_cleaned']
x.head()

In [None]:
y=data3['usersentiment']
y.head()

## Reviews longer than 150 words have been removed; next, vectorise the remaining reviews

In [None]:
y.shape

In [None]:
x.shape

In [None]:
# TF-IDF: term frequency inverse document frequency
# Fit the vectoriser `tf` on the cleaned reviews and replace `x` with the
# resulting matrix converted to a dense array.
# NOTE(review): the vectoriser is fitted on all of `x` before the train/test
# split performed in a later cell, so the test reviews contribute to the
# vocabulary and IDF weights - consider fitting on the training portion only.
x=tf.fit_transform(x).toarray()
x

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=42)

## From the initial analysis we know that the data is imbalanced; let's treat it using class-imbalance techniques

In [None]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
y_train.value_counts()

In [None]:
y_test.value_counts()

In [None]:
## Its a clear class imbalance 

In [None]:
import imblearn
from imblearn.combine import SMOTETomek

In [None]:
import pandas as pd

In [None]:
pd.set_option('max_colwidth',200)

In [None]:
x_train

In [None]:
from imblearn.over_sampling import SMOTE
# Oversample the minority sentiment class of the training split only.
# sampling_strategy=0.5 asks for a minority/majority ratio of 0.5 after
# resampling; it must be passed by keyword, and fit_resample is the current
# name of the method formerly called fit_sample (removed from imbalanced-learn).
smt = SMOTE(sampling_strategy=0.5,random_state=42)
x_train_SMOTE, y_train_SMOTE = smt.fit_resample(x_train, y_train)

In [None]:
print(x_train_SMOTE.shape)
print(y_train_SMOTE.shape)

In [None]:
y_train.value_counts()

In [None]:
y_train_SMOTE.value_counts()

## Naivebayes after smote analysis

In [None]:
from sklearn.naive_bayes import BernoulliNB,MultinomialNB

In [None]:
nbc=BernoulliNB()

In [None]:
nbc.fit(x_train_SMOTE,y_train_SMOTE)

In [None]:
y_pred=nbc.predict(x_test)
y_pred

In [None]:
print(accuracy_score(y_test, y_pred))

In [None]:
print(confusion_matrix(y_test,y_pred))


In [None]:
print(classification_report(y_test, y_pred))

In [None]:
nbc=MultinomialNB()

In [None]:
nbc.fit(x_train_SMOTE,y_train_SMOTE)

In [None]:
y_pred=nbc.predict(x_test)
y_pred

In [None]:
print(accuracy_score(y_test, y_pred))

In [None]:
print(confusion_matrix(y_test,y_pred))


In [None]:
print(classification_report(y_test, y_pred))

## Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression()

In [None]:
lr.fit(x_train_SMOTE,y_train_SMOTE)

In [None]:
y_pred=lr.predict(x_test)
y_pred

In [None]:
print(accuracy_score(y_test, y_pred))

In [None]:
print(confusion_matrix(y_test,y_pred))


In [None]:
print(classification_report(y_test, y_pred))

## random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
rc=RandomForestClassifier(random_state=42,n_jobs=-1)

In [None]:
# Create the parameter grid based on the results of random search 
params = {
    'max_depth': [1, 2, 5, 10, 20],
    'min_samples_leaf': [5, 10, 20, 50, 100],
    'max_features': [2,3,4],
    'n_estimators': [10, 30, 50, 100, 200]
}

In [None]:
# Instantiate the grid search model
grid_search = GridSearchCV(estimator=rc, param_grid=params, 
                          cv=4, n_jobs=-1, verbose=1, scoring = "accuracy")

In [None]:
%%time
grid_search.fit(x_train_SMOTE,y_train_SMOTE)

In [None]:
rf_best = grid_search.best_estimator_
rf_best

In [None]:
rc=RandomForestClassifier(max_depth=20, max_features=4, min_samples_leaf=5,n_estimators=10, n_jobs=-1, random_state=42)
rc

In [None]:
rc.fit(x_train_SMOTE,y_train_SMOTE)

In [None]:
y_pred=rc.predict(x_test)
y_pred

In [None]:
print(accuracy_score(y_test, y_pred))

In [None]:
print(confusion_matrix(y_test,y_pred))


In [None]:
print(classification_report(y_test, y_pred))

In [None]:
data3.info()

In [None]:
data3.head()

## Recommendation

In [None]:
# Content based
# collaborative (user-user based and item-item based using cosine similarity)

In [None]:
## Lets create a new dataframe called data4 for recommendation algorithm building

In [None]:
# Independent copy of data3 for the recommendation part; .copy() (rather than
# the full slice data3[:]) guarantees later edits to data4 cannot touch data3.
data4=data3.copy()
data4.head(3)

In [None]:
## user-user based

In [None]:
# Test and Train split of the dataset.
from sklearn.model_selection import train_test_split
train, test = train_test_split(data4, test_size=0.2, random_state=42)

In [None]:
print(train.shape)
print(test.shape)

In [None]:
train.head(3)

In [None]:
test.head(1)

In [None]:
data4['reviews_rating'].value_counts()

In [None]:
# Create a pivot table and see how rating is provided by each user by different products
df_pivot = train.pivot_table(
    index='reviews_username',
    columns='name',
    values='reviews_rating'
).fillna(0)

df_pivot.head(3)

In [None]:
## creating dummy train and dummy test dataset

### Creating dummy train & dummy test dataset
These datasets will be used for prediction and evaluation.
- Dummy train will be used later for prediction of the products which have not been rated by the user. To ignore the products already rated by the user, we mark them as 0 during prediction. The products not rated by the user are marked as 1 for prediction in the dummy train dataset.

- Dummy test will be used for evaluation. To evaluate, we only make predictions on the products rated by the user, so these are marked as 1. This is just the opposite of dummy_train.

In [None]:
# Copy the train dataset into dummy_train
dummy_train = train.copy()

In [None]:
# The movies not rated by user is marked as 1 for prediction. 
dummy_train['reviews_rating'] = dummy_train['reviews_rating'].apply(lambda x: 0 if x>=1 else 1)

In [None]:
dummy_train['reviews_rating'].value_counts()

In [None]:
# Convert the dummy train dataset into matrix format.
dummy_train = dummy_train.pivot_table(
    index='reviews_username',
    columns='name',
    values='reviews_rating'
).fillna(1)

In [None]:
dummy_train.head(5)

**Cosine Similarity**

Cosine Similarity is a measurement that quantifies the similarity between two vectors [Which is Rating Vector in this case] 

**Adjusted Cosine**

Adjusted cosine similarity is a modified version of vector-based similarity that accounts for the fact that different users have different rating schemes. In other words, some users might rate items highly in general, while others tend to give lower ratings. To handle this, we subtract each user's average rating from that user's ratings of the different products.



# User Similarity Matrix

## Using Cosine Similarity

In [None]:
from sklearn.metrics.pairwise import pairwise_distances

# Creating the User Similarity Matrix using pairwise_distance function.
user_correlation = 1 - pairwise_distances(df_pivot, metric='cosine')
user_correlation[np.isnan(user_correlation)] = 0
print(user_correlation)

In [None]:
user_correlation.shape

In [None]:
dummy_train.head()

## Using adjusted Cosine 

### Here, we are not removing the NaN values and calculating the mean only for the movies rated by the user

In [None]:
# Create a user-movie matrix.
df_pivot = train.pivot_table(
    index='reviews_username',
    columns='name',
    values='reviews_rating'
)

In [None]:
df_pivot.tail(10)

### Normalising the rating of the movie for each user around 0 mean

In [None]:
mean = np.nanmean(df_pivot, axis=1)
df_subtracted = (df_pivot.T-mean).T

In [None]:
df_subtracted.head()

In [None]:
df_subtracted.tail(10)

### Finding cosine similarity

In [None]:
from sklearn.metrics.pairwise import pairwise_distances

In [None]:
# Creating the User Similarity Matrix using pairwise_distance function.
user_correlation = 1 - pairwise_distances(df_subtracted.fillna(0), metric='cosine')
user_correlation[np.isnan(user_correlation)] = 0
print(user_correlation)

## Prediction - User User

Doing the prediction for the users which are positively related with other users, and not the users which are negatively related as we are interested in the users which are more similar to the current users. So, ignoring the correlation for values less than 0. 

In [None]:
user_correlation[user_correlation<0]=0
user_correlation

Rating predicted by the user (for movies rated as well as not rated) is the weighted sum of correlation with the movie rating (as present in the rating dataset). 

In [None]:
user_predicted_ratings = np.dot(user_correlation, df_pivot.fillna(0))
user_predicted_ratings

In [None]:
user_predicted_ratings.shape

Since we are interested only in the movies not rated by the user, we will ignore the movies rated by the user by making it zero. 

In [None]:
user_final_rating = np.multiply(user_predicted_ratings,dummy_train)
user_final_rating.head()

In [None]:
user_final_rating.tail(10)

### Finding the top 5 recommendation for the *user*

In [None]:
# Take the user ID as input.
user_input = input("Enter your user name ")
print(user_input)

In [None]:
user_final_rating.head(2)

In [None]:
user_final_rating.tail()

In [None]:
d = user_final_rating.loc[user_input].sort_values(ascending=False)[0:20]
d

# Evaluation - User User 

Evaluation will be the same as you have seen above for the prediction. The only difference is that you will evaluate on the products already rated by the user instead of predicting for the products not rated by the user.

In [None]:
test.head(2)

In [None]:
# Find out the common users of test and train dataset.
common = test[test.reviews_username.isin(train.reviews_username)]
common.shape

In [None]:
common.head(1)

In [None]:
# convert into the user-movie matrix.
common_user_based_matrix = common.pivot_table(index='reviews_username', columns='name', values='reviews_rating')
common_user_based_matrix.head(2)

In [None]:
# Convert the user_correlation matrix into dataframe.
user_correlation_df = pd.DataFrame(user_correlation)

In [None]:
df_subtracted.head(1)

In [None]:
user_correlation_df['reviews_username'] = df_subtracted.index
user_correlation_df.set_index('reviews_username',inplace=True)
user_correlation_df.head()

In [None]:
common.head(1)

In [None]:
list_name = common.reviews_username.tolist()

user_correlation_df.columns = df_subtracted.index.tolist()


user_correlation_df_1 =  user_correlation_df[user_correlation_df.index.isin(list_name)]

In [None]:
user_correlation_df.columns

In [None]:
list_name

In [None]:
user_correlation_df.head(1)

In [None]:
user_correlation_df_1.shape

In [None]:
user_correlation_df_2 = user_correlation_df_1.T[user_correlation_df_1.T.index.isin(list_name)]

In [None]:
user_correlation_df_3 = user_correlation_df_2.T

In [None]:
user_correlation_df_3.head()

In [None]:
user_correlation_df_3.shape

In [None]:
user_correlation_df_3[user_correlation_df_3<0]=0

common_user_predicted_ratings = np.dot(user_correlation_df_3, common_user_based_matrix.fillna(0))
common_user_predicted_ratings

In [None]:
dummy_test = common.copy()

dummy_test['reviews_rating'] = dummy_test['reviews_rating'].apply(lambda x: 1 if x>=1 else 0)

dummy_test = dummy_test.pivot_table(index='reviews_username', columns='name', values='reviews_rating').fillna(0)

In [None]:
dummy_test.shape

In [None]:
common_user_predicted_ratings = np.multiply(common_user_predicted_ratings,dummy_test)

In [None]:
common_user_predicted_ratings.head(2)

Calculating the RMSE for only the movies rated by user. For RMSE, normalising the rating to (1,5) range.

In [None]:
# Rescale the positive predicted ratings to the 1-5 rating range.
from sklearn.preprocessing import MinMaxScaler
# NOTE(review): the star import rebinds same-named globals (e.g. `sum`, `any`,
# `all`) to NumPy's versions for every cell executed after this one.
from numpy import *

X  = common_user_predicted_ratings.copy() 
# Keep only predictions greater than 0; every other cell becomes NaN.
X = X[X>0]

# MinMaxScaler scales each column independently to the requested range.
scaler = MinMaxScaler(feature_range=(1, 5))
print(scaler.fit(X))
# NOTE: this rebinds `y`, which earlier held the sentiment labels.
y = (scaler.transform(X))

print(y)

In [None]:
common_ = common.pivot_table(index='reviews_username', columns='name', values='reviews_rating')

In [None]:
# Finding total non-NaN value
total_non_nan = np.count_nonzero(~np.isnan(y))

In [None]:
# Root-mean-square error between the actual ratings (common_) and the scaled
# predictions (y), computed over the cells where both are present.
# np.nansum is used explicitly so the result does not depend on whether the
# name `sum` currently refers to the builtin or to NumPy's sum.
squared_error = ((common_ - y)**2).to_numpy()
rmse = (np.nansum(squared_error)/total_non_nan)**0.5
print(rmse)

## Using Item similarity

# Item Based Similarity

Taking the transpose of the rating matrix to normalize the rating around the mean for different movie ID. In the user based similarity, we had taken mean for each user instead of each movie. 

In [None]:
train.head(1)

In [None]:
df_pivot = train.pivot_table(
    index='reviews_username',
    columns='name',
    values='reviews_rating'
).T

df_pivot.head()

Normalising the rating of each product around its mean, for use with the Adjusted Cosine

In [None]:
mean = np.nanmean(df_pivot, axis=1)
df_subtracted = (df_pivot.T-mean).T

In [None]:
df_subtracted.head()

Finding the cosine similarity using pairwise distances approach

In [None]:
from sklearn.metrics.pairwise import pairwise_distances

In [None]:
# Item Similarity Matrix
item_correlation = 1 - pairwise_distances(df_subtracted.fillna(0), metric='cosine')
item_correlation[np.isnan(item_correlation)] = 0
print(item_correlation)

Filtering the correlation only for which the value is greater than 0. (Positively correlated)

In [None]:
item_correlation[item_correlation<0]=0
item_correlation

# Prediction - Item Item

In [None]:
item_predicted_ratings = np.dot((df_pivot.fillna(0).T),item_correlation)
item_predicted_ratings

In [None]:
item_predicted_ratings.shape

In [None]:
dummy_train.shape

### Filtering the rating only for the movies not rated by the user for recommendation

In [None]:
item_final_rating = np.multiply(item_predicted_ratings,dummy_train)
item_final_rating.head()

### Finding the top 5 recommendation for the *user*



In [None]:
# Take the user ID as input
user_input = input("Enter your user name ")
print(user_input)

In [None]:
# Recommending the Top 5 products to the user.
d = item_final_rating.loc[user_input].sort_values(ascending=False)[0:5]
d

# Evaluation - Item Item

Evaluation will be the same as you have seen above for the prediction. The only difference is that you will evaluate on the products already rated by the user instead of predicting for the products not rated by the user.

In [None]:
test.columns

In [None]:
common =  test[test.name.isin(train.name)]
common.shape

In [None]:
common.head(4)

In [None]:
common_item_based_matrix = common.pivot_table(index='reviews_username', columns='name', values='reviews_rating').T

In [None]:
common_item_based_matrix.shape

In [None]:
item_correlation_df = pd.DataFrame(item_correlation)

In [None]:
item_correlation_df.head(1)

In [None]:
item_correlation_df['name'] = df_subtracted.index
item_correlation_df.set_index('name',inplace=True)
item_correlation_df.head()

In [None]:
list_name = common.name.tolist()

In [None]:
item_correlation_df.columns = df_subtracted.index.tolist()

item_correlation_df_1 =  item_correlation_df[item_correlation_df.index.isin(list_name)]

In [None]:
item_correlation_df_2 = item_correlation_df_1.T[item_correlation_df_1.T.index.isin(list_name)]

item_correlation_df_3 = item_correlation_df_2.T

In [None]:
item_correlation_df_3.head()

In [None]:
item_correlation_df_3[item_correlation_df_3<0]=0

common_item_predicted_ratings = np.dot(item_correlation_df_3, common_item_based_matrix.fillna(0))
common_item_predicted_ratings


In [None]:
common_item_predicted_ratings.shape

Dummy test will be used for evaluation. To evaluate, we will only make prediction on the movies rated by the user. So, this is marked as 1. This is just opposite of dummy_train



In [None]:
dummy_test = common.copy()

dummy_test['reviews_rating'] = dummy_test['reviews_rating'].apply(lambda x: 1 if x>=1 else 0)

dummy_test = dummy_test.pivot_table(index='reviews_username', columns='name', values='reviews_rating').T.fillna(0)

common_item_predicted_ratings = np.multiply(common_item_predicted_ratings,dummy_test)

The products not rated is marked as 0 for evaluation. And make the item- item matrix representaion.


In [None]:
common_ = common.pivot_table(index='reviews_username', columns='name', values='reviews_rating').T

In [None]:
# Rescale the positive item-based predicted ratings to the 1-5 rating range.
from sklearn.preprocessing import MinMaxScaler
# NOTE(review): the star import rebinds same-named globals (e.g. `sum`, `any`,
# `all`) to NumPy's versions for every cell executed after this one.
from numpy import *

X  = common_item_predicted_ratings.copy() 
# Keep only predictions greater than 0; every other cell becomes NaN.
X = X[X>0]

# MinMaxScaler scales each column independently to the requested range.
scaler = MinMaxScaler(feature_range=(1, 5))
print(scaler.fit(X))
# NOTE: this rebinds `y` with the scaled item-based predictions.
y = (scaler.transform(X))

print(y)

In [None]:
# Finding total non-NaN value
total_non_nan = np.count_nonzero(~np.isnan(y))

In [None]:
# Root-mean-square error between the actual ratings (common_) and the scaled
# item-based predictions (y), computed over the cells where both are present.
# np.nansum is used explicitly so the result does not depend on whether the
# name `sum` currently refers to the builtin or to NumPy's sum.
squared_error = ((common_ - y)**2).to_numpy()
rmse = (np.nansum(squared_error)/total_non_nan)**0.5
print(rmse)

In [None]:
# User based Results

In [None]:
d = user_final_rating.loc[user_input].sort_values(ascending=False)[0:20]
d

In [None]:
# Convert data into dataframe

In [None]:
dd = {'product': d.index, 'recomvalue': d.values}

In [None]:
# Build the recommendation table straight from the dict. The default 0..n-1
# index equals the former range(0,20) when 20 products are returned, and no
# longer fails with a length mismatch when fewer products are available.
newdf=pd.DataFrame(dd)
newdf

In [None]:
lr.predict(tf.transform(["very bad experience boyfriend bought spice thing bedroom highly disappointed product bought one absolutely love ky mine thought would similar affect absolutely nothing buy"]).toarray())

In [None]:
sum(train[train['name'] == 'Planes: Fire Rescue (2 Discs) (includes Digital Copy) (blu-Ray/dvd)']['usersentiment'].values)

In [None]:
sum(train[train['name'] == 'Planes: Fire Rescue (2 Discs) (includes Digital Copy) (blu-Ray/dvd)']['usersentiment'].values)/len(train[train['name'] == 'Planes: Fire Rescue (2 Discs) (includes Digital Copy) (blu-Ray/dvd)'])

In [None]:
# Share of positive reviews (mean of the 0/1 usersentiment flag) for every
# recommended product, taken from the training split.
# One groupby replaces the loop that filtered `train` twice per product and was
# hard-coded to exactly 20 rows; reindex keeps the order of newdf['product'].
positive_share=train['usersentiment'].astype(float).groupby(train['name']).mean()
positiverating=positive_share.reindex(newdf['product']).tolist()

In [None]:
newdf['positiverating']=positiverating

In [None]:
newdf

In [None]:
newdf.sort_values(['positiverating'],ascending=False)

In [None]:
## Top 5
newdf.sort_values(['positiverating'],ascending=False)[:5]