In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from afinn import Afinn

from src.DataIngestor import DataIngestor
from src.DataCleaner import DataCleaner

# Single AFINN scorer instance; its `score` method is applied to each review
# text further down in the notebook.
afinn=Afinn()

In [2]:
# Import and clean the Google Play Store dataset.

di = DataIngestor()
google_dataframe=di.read_file('./database/googleplaystore.csv')
dc=DataCleaner()
# The return value of clean_all is discarded, so it presumably cleans
# google_dataframe in place -- TODO confirm against src/DataCleaner.
dc.clean_all(google_dataframe)

# Preview the first 10 rows of the cleaned frame.
google_dataframe.head(10)


Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159.0,19000000.0,10000,Free,0.0,Everyone,Art & Design,"January 7, 2018"
1,Coloring book moana,ART_AND_DESIGN,3.9,967.0,14000000.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,"January 15, 2018"
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510.0,8.7,5000000,Free,0.0,Everyone,Art & Design,"August 1, 2018"
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644.0,25000000.0,50000000,Free,0.0,Teen,Art & Design,"June 8, 2018"
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967.0,2.8,100000,Free,0.0,Everyone,Art & Design;Creativity,"June 20, 2018"
5,Paper flowers instructions,ART_AND_DESIGN,4.4,167.0,5.6,50000,Free,0.0,Everyone,Art & Design,"March 26, 2017"
6,Smoke Effect Photo Maker - Smoke Editor,ART_AND_DESIGN,3.8,178.0,19000000.0,50000,Free,0.0,Everyone,Art & Design,"April 26, 2018"
7,Infinite Painter,ART_AND_DESIGN,4.1,36815.0,29000000.0,1000000,Free,0.0,Everyone,Art & Design,"June 14, 2018"
8,Garden Coloring Book,ART_AND_DESIGN,4.4,13791.0,33000000.0,1000000,Free,0.0,Everyone,Art & Design,"September 20, 2017"
9,Kids Paint Free - Drawing Fun,ART_AND_DESIGN,4.7,121.0,3.1,10000,Free,0.0,Everyone,Art & Design;Creativity,"July 3, 2018"


In [3]:
# Import and clean the Google Play user-reviews dataset.

reviews = di.read_file('./database/googleplaystore_user_reviews.csv')
# Discard the rows that have no translated review text. The return value is
# not used, so remove_na presumably works in place -- TODO confirm against
# src/DataCleaner.
dc.remove_na(reviews,'Translated_Review')
# Renumber the remaining rows 0..n-1. drop=True throws the old index away
# directly instead of first materialising it as an 'index' column and then
# dropping that column in a second step; the two-step form fails when the
# index has a name or when a column called 'index' already exists.
reviews.reset_index(drop=True, inplace=True)

# Preview the first 10 rows.
reviews.head(10)
                 

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462
2,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875
3,10 Best Foods for You,Best idea us,Positive,1.0,0.3
4,10 Best Foods for You,Best way,Positive,1.0,0.3
5,10 Best Foods for You,Amazing,Positive,0.6,0.9
6,10 Best Foods for You,"Looking forward app,",Neutral,0.0,0.0
7,10 Best Foods for You,It helpful site ! It help foods get !,Neutral,0.0,0.0
8,10 Best Foods for You,good you.,Positive,0.7,0.6
9,10 Best Foods for You,Useful information The amount spelling errors ...,Positive,0.2,0.1


In [4]:
# Attach each app's category (taken from the Play Store dataset) to its reviews.

# Keep a single category row per app before merging: a left merge emits one
# output row per matching right-hand row, so repeated 'App' entries in
# google_dataframe would duplicate that app's reviews, inflating row counts and
# biasing any aggregate taken across apps.
# NOTE(review): whether dc.clean_all already de-duplicates apps is not visible
# here; if it does, drop_duplicates is a no-op.
app_categories = google_dataframe[['App', 'Category']].drop_duplicates(subset='App')

# validate='many_to_one' makes pandas check that 'App' is unique on the right side.
reviews_complete = pd.merge(
    reviews,
    app_categories,
    how='left',
    on='App',
    validate='many_to_one',
)
reviews_complete.head(5)

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity,Category
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333,HEALTH_AND_FITNESS
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462,HEALTH_AND_FITNESS
2,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875,HEALTH_AND_FITNESS
3,10 Best Foods for You,Best idea us,Positive,1.0,0.3,HEALTH_AND_FITNESS
4,10 Best Foods for You,Best way,Positive,1.0,0.3,HEALTH_AND_FITNESS


In [5]:
# Build the combined list of negative and positive words that is used with
# the 'AFINN method'.

import itertools

# Each spreadsheet is read as a table and converted to a list of rows.
negative = pd.read_excel('./database/n.xlsx').values.tolist()
positive = pd.read_excel('./database/p.xlsx').values.tolist()

# Flatten the rows of each table into one flat list of cell values.
lista_appiattita_n = list(itertools.chain(*negative))
lista_appiattita_p = list(itertools.chain(*positive))

# Full vocabulary: negative entries first, then positive ones.
lista = [*lista_appiattita_n, *lista_appiattita_p]




In [6]:
# Reduce every text of a column (e.g. Translated_Review) to the words that
# also appear in a given word list (e.g. the negative and positive words).

def replace_common_strings(df, col_name, string_list):
    """Keep, in every text of ``df[col_name]``, only the words in ``string_list``.

    Each value is split on whitespace; the tokens that are members of
    ``string_list`` are kept in their original order (repeats included) and
    re-joined with single spaces. A text without any match becomes ``""``.
    Matching is exact: it is case-sensitive and a token with attached
    punctuation (``"good."``) does not match ``"good"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column. It is modified in place.
    col_name : str
        Name of the column whose values are strings.
    string_list : iterable
        Words to keep.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``col_name`` overwritten.
    """
    try:
        # Hashed lookup: constant time per token instead of scanning the
        # whole word list for every token of every text.
        vocabulary = frozenset(string_list)
    except TypeError:
        # The list holds unhashable entries: fall back to scanning it as given.
        vocabulary = string_list

    def keep_known_words(text):
        return " ".join(token for token in text.split() if token in vocabulary)

    df[col_name] = df[col_name].apply(keep_known_words)
    return df


In [7]:
# Apply the filter to the merged reviews: 'Translated_Review' is overwritten
# with only the words found in the negative/positive word list.
replace_common_strings(
    df=reviews_complete,
    col_name='Translated_Review',
    string_list=lista,
)

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity,Category
0,10 Best Foods for You,like delicious,Positive,1.000000,0.533333,HEALTH_AND_FITNESS
1,10 Best Foods for You,healthy,Positive,0.250000,0.288462,HEALTH_AND_FITNESS
2,10 Best Foods for You,great,Positive,0.400000,0.875000,HEALTH_AND_FITNESS
3,10 Best Foods for You,,Positive,1.000000,0.300000,HEALTH_AND_FITNESS
4,10 Best Foods for You,,Positive,1.000000,0.300000,HEALTH_AND_FITNESS
...,...,...,...,...,...,...
37422,Housing-Real Estate & Property,wrong,Positive,0.173333,0.486667,LIFESTYLE
37423,Housing-Real Estate & Property,issue improve,Positive,0.225000,0.447222,LIFESTYLE
37424,Housing-Real Estate & Property,,Negative,-0.287500,0.250000,LIFESTYLE
37425,Housing-Real Estate & Property,happy,Positive,0.800000,1.000000,LIFESTYLE


In [8]:
# Add a column holding the AFINN score of each review text.
review_texts = reviews_complete['Translated_Review']
reviews_complete['AFINN_Score'] = review_texts.apply(afinn.score)

reviews_complete.head(10)

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity,Category,AFINN_Score
0,10 Best Foods for You,like delicious,Positive,1.0,0.533333,HEALTH_AND_FITNESS,5.0
1,10 Best Foods for You,healthy,Positive,0.25,0.288462,HEALTH_AND_FITNESS,2.0
2,10 Best Foods for You,great,Positive,0.4,0.875,HEALTH_AND_FITNESS,3.0
3,10 Best Foods for You,,Positive,1.0,0.3,HEALTH_AND_FITNESS,0.0
4,10 Best Foods for You,,Positive,1.0,0.3,HEALTH_AND_FITNESS,0.0
5,10 Best Foods for You,,Positive,0.6,0.9,HEALTH_AND_FITNESS,0.0
6,10 Best Foods for You,,Neutral,0.0,0.0,HEALTH_AND_FITNESS,0.0
7,10 Best Foods for You,helpful,Neutral,0.0,0.0,HEALTH_AND_FITNESS,2.0
8,10 Best Foods for You,good,Positive,0.7,0.6,HEALTH_AND_FITNESS,3.0
9,10 Best Foods for You,errors,Positive,0.2,0.1,HEALTH_AND_FITNESS,-2.0


In [9]:
# Add a column with the sign of the AFINN score:
# 1 for a positive score, 0 for a neutral one, -1 for a negative one.
afinn_scores = reviews_complete['AFINN_Score']
reviews_complete['sign_afinn'] = np.sign(afinn_scores)

In [10]:
# Function that returns the mean AFINN score of a single app.

def gradimento_medio(dataframe, app):
    """Return the mean ``AFINN_Score`` of the rows whose ``App`` equals ``app``.

    The result is a Series indexed by ``App`` and named ``AFINN_Score``; it
    has one entry for the requested app, or none when no row matches.
    """
    is_requested_app = dataframe["App"] == app
    app_rows = dataframe.loc[is_requested_app]
    scores_by_app = app_rows.groupby("App")["AFINN_Score"]
    return scores_by_app.mean()


In [11]:
# Function that returns a sentiment index for a single app, from -1 to 1:
# the mean of the signs of the AFINN scores of that app's reviews.

def indice_gradimento(dataframe, app):
    """Return the mean ``sign_afinn`` of the rows whose ``App`` equals ``app``.

    The result is a Series indexed by ``App`` and named ``sign_afinn``; it
    has one entry for the requested app, or none when no row matches.
    """
    is_requested_app = dataframe["App"] == app
    app_rows = dataframe.loc[is_requested_app]
    signs_by_app = app_rows.groupby("App")["sign_afinn"]
    return signs_by_app.mean()

In [12]:
# Test 1: mean sign of the AFINN scores (between -1 and 1) for one app.
indice_gradimento(dataframe=reviews_complete, app='10 Best Foods for You')

App
10 Best Foods for You    0.381443
Name: sign_afinn, dtype: float64

In [13]:
# Test 2: mean AFINN score for the same app.
gradimento_medio(dataframe=reviews_complete, app='10 Best Foods for You')

App
10 Best Foods for You    1.546392
Name: AFINN_Score, dtype: float64