###  Step 1 : Importing necessary Libaries 

In [78]:
import numpy as np # for working with arrays,linear algebra, matrices 
import pandas as pd # for data manipulation 
import nltk # for static nlp 
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import re #raise exception if occurs
from textblob import TextBlob
from wordcloud import WordCloud # highlight popular words for analysis 
import seaborn as sns # for data visualization 
import matplotlib.pyplot as plt # for data visualization 
import cufflinks as cf #links matplotlib with pandas  
#to display plots inside notebook, basically for offline graphics 
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected = True)
cf.go_offline();
# plotly defines fundamental classes 
import plotly.graph_objs as go 
from plotly.subplots import make_subplots 

import warnings 
warnings.filterwarnings("ignore")
warnings.warn("this will not show")

pd.set_option('display.max_columns', None)


In [79]:
df = pd.read_csv("amazon.csv")


### step 2:  Data manipulation 

In [80]:
df = df.sort_values("wilson_lower_bound", ascending=False)
df.drop('Unnamed: 0', inplace= True, axis= 1)
df.head()

Unnamed: 0,reviewerName,overall,reviewText,reviewTime,day_diff,helpful_yes,helpful_no,total_vote,score_pos_neg_diff,score_average_rating,wilson_lower_bound
2031,"Hyoun Kim ""Faluzure""",5,[[ UPDATE - 6/19/2014 ]]So my lovely wife boug...,05-01-2013,702,1952,68,2020,1884,0.966337,0.957544
3449,NLee the Engineer,5,I have tested dozens of SDHC and micro-SDHC ca...,26-09-2012,803,1428,77,1505,1351,0.948837,0.936519
4212,SkincareCEO,1,NOTE: please read the last update (scroll to ...,08-05-2013,579,1568,126,1694,1442,0.92562,0.912139
317,"Amazon Customer ""Kelly""",1,"If your card gets hot enough to be painful, it...",09-02-2012,1033,422,73,495,349,0.852525,0.818577
4672,Twister,5,Sandisk announcement of the first 128GB micro ...,03-07-2014,158,45,4,49,41,0.918367,0.808109


- **sort** : sorts the value of wilson_lower_bound with highest value at top 
- **drop** : deletes the column named `Unnamed: 0 ` 
- **inplace=True** : Make modification in original dataset 

#### 2.1 Handling Missing values 

In [81]:
# Function to analyze missing values in a DataFrame
def missing_values_analysis(df):
    # Identify columns with missing values
    na_columns_ = [col for col in df.columns if df[col].isnull().sum() > 0]
    
    # Count missing values and calculate the ratio
    n_miss = df[na_columns_].isnull().sum().sort_values(ascending=True)
    ratio_ = (df[na_columns_].isnull().sum() / df.shape[0] * 100).sort_values(ascending=True)
    
    # Create a DataFrame to display missing values and ratios
    missing_df = pd.concat([n_miss, np.round(ratio_, 2)], axis=1, keys=['Missing values', 'Ratio'])
    
    return missing_df



### step 3. Function to check and summarize DataFrame information


In [82]:
# Function to check and summarize DataFrame information
def check_dataframe(df, head=5, tail=5):
    # Display shape of the DataFrame
    print("Shape".center(82, '~'))
    print("Rows: {}".format(df.shape[0]))
    print("Columns: {}".format(df.shape[1]))
    
    # Display data types of columns
    print("Types".center(82, '~'))
    print(df.dtypes)
    
    # Display missing values analysis
    print("".center(82, '~'))
    print(missing_values_analysis(df))
    
    # Display number of duplicated values
    print("Duplicated Values".center(83, '~'))
    print(df.duplicated().sum())
    
    # Display quantiles of the DataFrame
    print("Quantiles".center(82, '~'))
    print(df.describe().T)  # Use df.describe() instead of df.quantile()

# Example usage
check_dataframe(df)


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~Shape~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Rows: 4915
Columns: 11
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~Types~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
reviewerName             object
overall                   int64
reviewText               object
reviewTime               object
day_diff                  int64
helpful_yes               int64
helpful_no                int64
total_vote                int64
score_pos_neg_diff        int64
score_average_rating    float64
wilson_lower_bound      float64
dtype: object
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
              Missing values  Ratio
reviewerName               1   0.02
reviewText                 1   0.02
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~Duplicated Values~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
0
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~Quantiles~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                       count        mean         std    min    25%    50%  \
overall 

checking a particular row comment

In [83]:
review_example = df.reviewText[2031]
review_example

'[[ UPDATE - 6/19/2014 ]]So my lovely wife bought me a Samsung Galaxy Tab 4 for Father\'s Day and I\'ve been loving it ever since.  Just as other with Samsung products, the Galaxy Tab 4 has the ability to add a microSD card to expand the memory on the device.  Since it\'s been over a year, I decided to do some more research to see if SanDisk offered anything new.  As of 6/19/2014, their product lineup for microSD cards from worst to best (performance-wise) are the as follows:SanDiskSanDisk UltraSanDisk Ultra PLUSSanDisk ExtremeSanDisk Extreme PLUSSanDisk Extreme PRONow, the difference between all of these cards are simply the speed in which you can read/write data to the card.  Yes, the published rating of most all these cards (except the SanDisk regular) are Class 10/UHS-I but that\'s just a rating... Actual real world performance does get better with each model, but with faster cards come more expensive prices.  Since Amazon doesn\'t carry the Ultra PLUS model of microSD card, I had 

Cleaning the particular row comment 

In [84]:
# removing all the puntucation from particular comment 
review_example = re.sub("[^a-zA-Z]","", review_example)

# converting all to lowercase and splitting them in new line 
review_example = review_example.lower().split()

review_example

['updatesomylovelywifeboughtmeasamsunggalaxytabforfathersdayandivebeenlovingiteversincejustasotherwithsamsungproductsthegalaxytabhastheabilitytoaddamicrosdcardtoexpandthememoryonthedevicesinceitsbeenoverayearidecidedtodosomemoreresearchtoseeifsandiskofferedanythingnewasoftheirproductlineupformicrosdcardsfromworsttobestperformancewisearetheasfollowssandisksandiskultrasandiskultraplussandiskextremesandiskextremeplussandiskextremepronowthedifferencebetweenallofthesecardsaresimplythespeedinwhichyoucanreadwritedatatothecardyesthepublishedratingofmostallthesecardsexceptthesandiskregularareclassuhsibutthatsjustaratingactualrealworldperformancedoesgetbetterwitheachmodelbutwithfastercardscomemoreexpensivepricessinceamazondoesntcarrytheultraplusmodelofmicrosdcardihadtododirectcomparisonsbetweenthesandiskultraextremeandextremeplusasmentionedinmyearlierreviewipurchasedthesandiskultraformygalaxysmyquestionwasdidiwanttopayovermoreforacardthatisfasterthantheoneialreadyownedoricouldpayalmostdoubletoge

Applying the same changes to the whole dataset 


In [85]:
rt = lambda x: re.sub("[^a-zA-Z]", ' ', str(x))
df["reviewText"] = df["reviewText"].map(rt)
df["reviewText"] = df["reviewText"].str.lower()
df.head()


Unnamed: 0,reviewerName,overall,reviewText,reviewTime,day_diff,helpful_yes,helpful_no,total_vote,score_pos_neg_diff,score_average_rating,wilson_lower_bound
2031,"Hyoun Kim ""Faluzure""",5,update so my lovely wife boug...,05-01-2013,702,1952,68,2020,1884,0.966337,0.957544
3449,NLee the Engineer,5,i have tested dozens of sdhc and micro sdhc ca...,26-09-2012,803,1428,77,1505,1351,0.948837,0.936519
4212,SkincareCEO,1,note please read the last update scroll to ...,08-05-2013,579,1568,126,1694,1442,0.92562,0.912139
317,"Amazon Customer ""Kelly""",1,if your card gets hot enough to be painful it...,09-02-2012,1033,422,73,495,349,0.852525,0.818577
4672,Twister,5,sandisk announcement of the first gb micro ...,03-07-2014,158,45,4,49,41,0.918367,0.808109


Performing sentiment analysis 

In [86]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Using TextBlob for polarity and subjectivity
df[['polarity', 'subjectivity']] = df['reviewText'].apply(lambda text: pd.Series(TextBlob(str(text)).sentiment))

# Using VaderSentiment for sentiment analysis
for index, row in df['reviewText'].items():
    score = SentimentIntensityAnalyzer().polarity_scores(str(row))
    
    neg = score['neg']
    neu = score['neu']
    pos = score['pos']
    
    if neg > pos: 
        df.loc[index, 'sentiment'] = "Negative"
    elif pos > neg: 
        df.loc[index, 'sentiment'] = "Positive"
    else:
        df.loc[index, 'sentiment'] = "Neutral"


In [None]:
df[df["sentiment"] ==  "Positive"].sort_values("wilson_lower_bound", ascending = False).head()

catagorize data into positive neagative and neutral 


In [None]:
import matplotlib.pyplot as plt

# Function to visualize sentiment distribution using both bar plot and pie chart
def plot_combined_chart(sentiment_summary):
    """
    Plots are a combined chart with both bar plot and pie chart to visualize the distribution of sentiments.

    """
    # Create subplots
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))

    # Pie chart
    labels = sentiment_summary['sentiment']
    sizes = sentiment_summary['Count']
    colors = ['lightcoral', 'lightblue', 'lightgreen']
    ax1.pie(sizes, labels=labels, autopct='%1.1f%%', colors=colors, startangle=90)
    ax1.set_title('Sentiment Distribution (Pie Chart)')

    # Bar plot
    ax2.bar(sentiment_summary['sentiment'], sentiment_summary['Count'], color=['lightcoral', 'lightblue', 'lightgreen'])
    ax2.set_xlabel('Sentiment')
    ax2.set_ylabel('Count')
    ax2.set_title('Sentiment Distribution (Bar Plot)')

    # Adjust layout
    plt.tight_layout()

    # Show the combined chart
    plt.show()

# Example usage:
sentiment_summary = categorical_variable_summary(df, 'sentiment')
plot_combined_chart(sentiment_summary)
