In [None]:
# First, let's import the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Colab-only step: opens a file picker so the ratings CSV can be uploaded to the runtime.
# NOTE(review): `data_to_load` is not referenced again in the visible cells; the CSV is
# re-read from disk by file name in the next cell.
from google.colab import files
data_to_load = files.upload()

In [None]:
# Load the ratings CSV (hard-coded name, resolved against the working directory)
# and preview the first rows.
df = pd.read_csv('Flipkart_ratings.csv')
df.head()

### Getting general information about our dataset

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics (count, mean, spread, quartiles) of the columns pandas describes by default.
df.describe()

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

In [None]:
# Number of distinct product titles in the dataset.
df['product_title'].nunique()

In [None]:
# A violin plot shows the distribution of a numeric variable as a mirrored density estimate;
# here: total votes per product title.
# x / y are given as column-name keywords: seaborn >= 0.12 accepts only `data` positionally,
# so passing the two Series positionally (together with data=df) raises a TypeError there.
fig, ax = plt.subplots(figsize=(20,12))
ax.set_title('Total votes by product title')
sns.violinplot(data=df, x='product_title', y='total_votes', ax=ax)

In [None]:
# Mean total_votes for each star rating (barplot aggregates with the mean by default and
# draws an error bar on top of each bar).
# Keyword x / y are required by seaborn >= 0.12; positional Series raise a TypeError there.
fig, ax = plt.subplots(figsize=(15,10))
ax.set_title('Average total votes by star rating')
sns.barplot(data=df, x='star_rating', y='total_votes', ax=ax)

In [None]:
# Star rating against the verified-purchase flag.
# Keyword x / y are required by seaborn >= 0.12; positional Series raise a TypeError there.
fig, ax = plt.subplots(figsize=(10,10))
ax.set_title('Star rating by verified purchase')
sns.lineplot(data=df, x='verified_purchase', y='star_rating', ax=ax)

In [None]:
# lineplot() can split the data into several lines through semantic groupings such as `hue`.
# Here: verified purchase vs. star rating, with one line per total_votes value.
# Keyword x / y are required by seaborn >= 0.12; positional Series raise a TypeError there.
fig, ax = plt.subplots(figsize=(10,10))
ax.set_title('Star rating by verified purchase, grouped by total votes')
sns.lineplot(data=df, x='verified_purchase', y='star_rating', hue='total_votes', ax=ax)

In [None]:
# Verified-purchase flag plotted against the number of helpful votes a review received.
# Keyword x / y are required by seaborn >= 0.12; positional Series raise a TypeError there.
fig, ax = plt.subplots(figsize=(15,12))
ax.set_title('Verified purchase by helpful votes')
sns.lineplot(data=df, x='helpful_votes', y='verified_purchase', ax=ax)

In [None]:
# Binary sentiment label derived from the star rating: 1 = positive (rating > 3), 0 = negative.
df['Sentiment'] = (df['star_rating'] > 3).astype('int64')

In [None]:
# Display the derived 0/1 sentiment column.
df['Sentiment']

In [None]:
# Mean star rating within each sentiment class (0 = negative, 1 = positive).
sns.barplot(data=df, x='Sentiment', y='star_rating')

## Final Approach

In [None]:
# Headlines to clean and vectorise. Missing headlines become empty strings and everything is
# coerced to str, because the cleaning steps below (re.sub, str.split, str.lower) raise on NaN.
review = df['review_headline'].fillna('').astype(str)

In [None]:
import nltk

In [None]:
from nltk.corpus import stopwords
# Stop words are common English words that add little meaning to a sentence; they can be
# dropped without losing the sense of the text.
from nltk.stem import SnowballStemmer
# Snowball is a stemming algorithm: it reduces a word to its stem so that related word forms
# share one token.
# The stop-word corpus is not bundled with nltk; without this download
# stopwords.words() raises LookupError on a fresh runtime.
nltk.download('stopwords', quiet=True)
stop_words = stopwords.words('english')

In [None]:
# English Snowball stemmer, reused for every review below (the constructor requires the language).
snow = SnowballStemmer('english') #the stemmer requires a language parameter

In [None]:
pip install emoji

In [None]:
import emoji
import re
from nltk.tokenize import word_tokenize

In [None]:
# Strip every "!" from each review headline (the character is removed, not replaced by a space).
review = [re.sub("!","",txt) for txt in review]
# Character-class regex covering several emoji / pictograph code-point ranges.
# NOTE(review): this pattern is not referenced anywhere else in the visible cells; emoji
# removal is done by give_emoji_free_text instead. Confirm whether it can be dropped.
emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
def give_emoji_free_text(comm): # function to remove emojis from reviews
    """Return `comm` without any whitespace-separated word that contains an emoji.

    The text is split on whitespace, every word holding at least one emoji
    character is dropped, and the remaining words are re-joined with single
    spaces (so leading/trailing/repeated whitespace is normalised as well).

    Parameters
    ----------
    comm : str
        Raw review text.

    Returns
    -------
    str
        The cleaned text; an empty string if nothing remains.
    """
    # The emoji package changed its lookup API across versions:
    #   * >= 1.7 provides emoji.is_emoji(); 2.0 removed UNICODE_EMOJI entirely.
    #   * 1.x nests UNICODE_EMOJI per language ({'en': {...}, ...}), so testing a
    #     character against the top-level dict never matches an emoji.
    #   * < 1.0 exposes UNICODE_EMOJI as a flat emoji -> name dict.
    if hasattr(emoji, "is_emoji"):
        is_emoji = emoji.is_emoji
    else:
        table = emoji.UNICODE_EMOJI
        table = table.get("en", table)
        is_emoji = table.__contains__
    return ' '.join(
        word for word in comm.split()
        if not any(is_emoji(ch) for ch in word)
    )

# Drop emoji-bearing words once; the helper is idempotent, so a second pass changes nothing.
review = [give_emoji_free_text(txt) for txt in review]
# Remove the (possibly truncated) "READ MORE" marker, hyphens and digits.
# All three are deleted outright, not replaced by a space.
# NOTE(review): deleting '-' glues hyphenated words together ("value-for-money" becomes
# "valueformoney"); confirm that this is intended rather than substituting a space.
review = [re.sub("READ MORE|READ MOR|READ MO","",i) for i in review]
review = [re.sub('-','',i) for i in review]
# Raw string: '\d' in a normal string literal is an invalid escape sequence
# (DeprecationWarning since Python 3.6, SyntaxWarning from 3.12).
review = [re.sub(r'\d','',i) for i in review]

In [None]:
# Domain words treated as stop words in addition to NLTK's English list.
setstop = ['money','purchase','penny','market','product','every']
stopw = set(stopwords.words('english')).union(setstop)

# For every review: lower-case and trim it, drop the stop words, stem the remaining words
# and join them back into one space-separated string.
text = [
    ' '.join(snow.stem(word) for word in message.lower().strip().split() if word not in stopw)
    for message in review
]

In [None]:
import nltk
# word_tokenize relies on the Punkt sentence tokenizer, an unsupervised model of
# abbreviations, collocations and sentence-starting words.
# Recent NLTK releases load it from the 'punkt_tab' resource instead of the pickled 'punkt'
# one and raise LookupError if it is missing, so both are fetched to cover either version.
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
# Tokenise every stemmed review and drop tokens that are stop words.
# The filter builds a new list: calling token.remove() while iterating over `token` skips
# the element that follows each removal, so consecutive stop words were left in the text.
v = []
for val in text:
    normalised = " ".join(str(val).lower().split())  # lower-case, single-spaced
    tokens = word_tokenize(normalised)  # splits on whitespace and punctuation
    v.append(" ".join(tkn for tkn in tokens if tkn not in stopw))

In [None]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
#Convert a collection of text documents to a matrix of token counts.
#Convert a collection of raw documents to a matrix of TF-IDF features.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score

In [None]:
# Bag-of-words features: token counts per review, vocabulary capped at 5000 terms,
# materialised as a dense array.
count_vect = CountVectorizer(max_features=5000)
bow_sparse = count_vect.fit_transform(v)
bow_data = bow_sparse.toarray()

In [None]:
# First ten vocabulary terms. get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is its replacement (available since 1.0).
count_vect.get_feature_names_out()[:10]

In [None]:
# TF-IDF features over the cleaned reviews, as a dense (n_reviews, n_terms) array.
tf_idf = TfidfVectorizer()
tf_sparse = tf_idf.fit_transform(v)
tf_data = tf_sparse.toarray()
tf_data.shape

In [None]:
import random  # no longer used in this cell; kept in case other cells rely on it
# Target for the TF-IDF classifier: the star-rating sentiment label (1 = rating > 3, else 0).
# Unseeded random 0/1 labels would fit the model to pure noise: accuracy hovers around 0.5
# and changes on every run, so the scores below would say nothing about the features.
z = df['Sentiment'].tolist()
df['Score'] = z
y = df['Score']

In [None]:
# Hold out a test split (default 25 %). A fixed seed makes the split, and every score derived
# from it, reproducible; stratifying keeps the class ratio equal in both parts. Same settings
# as the later TextBlob experiment.
x_train,x_test,y_train,y_test = train_test_split(tf_data, y, random_state=123, stratify=y)

In [None]:
# (training rows, TF-IDF features)
x_train.shape

In [None]:
# Number of held-out labels.
y_test.shape

In [None]:
# Logistic regression with 'balanced' class weights, fitted on the training split
# (fit returns the estimator itself), then used to predict the held-out rows.
lr = LogisticRegression(class_weight='balanced').fit(x_train, y_train)
y_pred = lr.predict(x_test)

In [None]:
# Accuracy on the held-out split.
accuracy_score(y_test,y_pred)

In [None]:
# Accuracy on the training split, for comparison with the test accuracy above.
accuracy_score(y_train,lr.predict(x_train))

In [None]:
# Confusion matrix of the test predictions (rows: true class, columns: predicted class).
cm = confusion_matrix(y_test,y_pred)
cm

# Trying Textblob

In [None]:
from textblob import TextBlob
# TextBlob is a Python library for processing textual data. It offers a simple API for common
# NLP tasks such as part-of-speech tagging, noun-phrase extraction and sentiment analysis.

# Sentiment polarity of every cleaned headline: negative values indicate negative sentiment,
# positive values positive sentiment, 0.0 neutral.
score = [TextBlob(doc).sentiment.polarity for doc in v]

In [None]:
# Collapse the polarity into a binary label in place:
# strictly positive -> 1, negative or neutral (0.0) -> 0.
for idx, polarity in enumerate(score):
    if polarity > 0.0:
        score[idx] = 1
    elif polarity <= 0.0:
        score[idx] = 0

In [None]:
# Overwrite the Score column with the binarised TextBlob labels.
df['Score'] = score

In [None]:
# Class balance of the TextBlob-derived labels.
df['Score'].value_counts()

In [None]:
# Replace the raw headlines with their cleaned, tokenised form.
df['review_headline'] = v
# TF-IDF features of the cleaned headlines as a dense array.
vect = TfidfVectorizer()
X = vect.fit_transform(df['review_headline']).toarray()

# One row per review, one column per vocabulary term. get_feature_names() was removed in
# scikit-learn 1.2; get_feature_names_out() is its replacement (available since 1.0).
X_df = pd.DataFrame(X, columns=vect.get_feature_names_out())

In [None]:
# Target vector: the binarised TextBlob label.
y = df['Score']

In [None]:
# Stratified, seeded split: 70 % of the rows train the model and 30 % are held out.
# test_size is the held-out fraction, so 0.7 here would fit the model on only 30 % of the data.
X_train,X_test,y_train,y_test = train_test_split(X_df,y,test_size=0.3,random_state=123,stratify=y)
# L2-regularised logistic regression (C=1.0) predicting the TextBlob-derived Score.
log_reg = LogisticRegression(penalty='l2',C=1.0).fit(X_train,y_train)
print("Training set:",log_reg.score(X_train,y_train))
print("Testing set:",log_reg.score(X_test,y_test))

In [None]:
# Predicted labels for the held-out rows.
y_pred = log_reg.predict(X_test) #predicting for X-test data

In [None]:
# Accuracy on the held-out split.
accuracy_score(y_test,y_pred)

In [None]:
# Accuracy on the training split, for comparison with the test accuracy above.
accuracy_score(y_train,log_reg.predict(X_train))

In [None]:
# Confusion matrix of the test predictions (rows: true class, columns: predicted class).
cm = confusion_matrix(y_test,y_pred)
cm