In [1]:
import csv
import pandas as pd
import numpy as np

In [2]:
# adds the rating level as a categorical (nominal value)
def add_rating_level(dataframe):
    rating_level = []
    for i in list(dataframe.Rating):
        if i >= 4.5:
            rating_level.append('high')
        elif i < 4:
            rating_level.append('low')
        else:
            rating_level.append('mid')
    dataframe['Rating_level'] = rating_level
    return dataframe

In [3]:
# removes the extra characters from the numerical columns 
def remove_extras(dataframe): 
    # add a new column with the lenghts of the names of the apss
    dataframe['AppNameLen'] = dataframe['App'].str.len()
    
    # Removes the M+ and the varies with the device from the ratings
    dataframe['Size'] = dataframe['Size'].str.replace(r'M+', '')
    dataframe['Size'] = dataframe['Size'].str.replace(r'Varies with device', ' ')
    
    # removes the "+" from the installs
    dataframe['Installs'] = dataframe['Installs'].str.replace(r'+', '')
    
    # Splits the genres column
    dataframe[['Genre1','Genre2']] = dataframe.Genres.apply(lambda x: pd.Series(str(x).split(";"))) 
    dataframe['Genre2'] = dataframe['Genre2'].str.replace(r'NaN', '')
    
    return dataframe

In [4]:
# calclates the last update on an app
def last_version_app(dataframe):
    
    # Scrapped timestamp retrieved from the last update 
    # that contains all three files on kaggle
    scrappedDate = pd.Timestamp('2018-09-18 17:27:29')
    scrappedDate = pd.to_datetime(scrappedDate).date()  

    # time since last update
    time = []
    updated_time = []
    for i in range (0, len(dataframe['Last Updated'])):
        if dataframe['Last Updated'][i] == "1.0.19":
            dataframe['Last Updated'][i] = pd.Timestamp('2019-01-01').date()
            time.append(pd.Timestamp(dataframe['Last Updated'][i]).date())
        else:
            time.append(pd.Timestamp(dataframe['Last Updated'][i]).date())

    # difference in months from the last update to the scrapped date        
    for i in range(0, len(time)):
        time[i] = pd.to_datetime(time[i]).date()    
        updated_time.append(scrappedDate.month - time[i].month) 

    # makes the list intervals so it fits in the column
    vals = pd.Series(updated_time) 
    dataframe['Updated_time'] = vals.values

    return dataframe

In [5]:
# separates the minimum requirement for in the version of android
def version_needed(dataframe):
    
    Android_version_needed = []
    for i in list(dataframe['Android Ver']):
        version = ''
        try:
            lst = i.split()
            v = ''.join(lst[0].split('.'))
            if len(v) < 3:
                v += '0'
            if v[2] not in ['0','1','2','3','4','5','6','7','8','9']:
                v = v[0:2]+'0'
            if v[0] not in ['0','1','2','3','4','5','6','7','8','9']:
                v = '000'
            if v[1] not in ['0','1','2','3','4','5','6','7','8','9']:
                v = '000'
            if v[2] not in ['0','1','2','3','4','5','6','7','8','9']:
                v = '000'
            if len(v) > 3:
                v = '0'
        except Exception as e:
            v = '0'
        if v == '000':
            v = '0'
        Android_version_needed.append(int(v))

    dataframe['Android_version_needed'] = Android_version_needed
    return dataframe

In [6]:
# creates the category based on the number of installs made for that app
def classification_installs(dataframe):
    install_category = []
    
    for i in list(dataframe.Installs):
        if i == 'Free':
            clean = 0
        else:
            num = i.strip('-+')
            clean = ''.join(num.split(','))
        install_category.append(int(clean))
   
    dataframe['install_category'] = install_category
    
    return dataframe

In [7]:
# creates the main categories of installations based on the classification
def categorize_installs(dataframe):
    install_class = []
    
    for i in list(dataframe.install_category):
        if int(i) > 500000:
            class0 = 'high'
        elif int(i) < 10000:
            class0 = 'low'
        else:
            class0 = 'mid'
        install_class.append(class0)
    
    dataframe['install_class'] = install_class
    
    return dataframe

In [8]:
# list1 apps
# list2 appname  (list of names with no duplicates)
# list3 sents   
# list4 sent  (list of the 3 unique values)

def get_dictionary(list1, list2, list3, list4):
    
    all_values = dict()
    count_pos, count_neg, count_neut = 0, 0, 0
    # calculates percentage based on sentiment per app    
    for k in range(0, len(list1)): 
        for l in range(0, len(list2)):        
            if list1[k] == list2[l]:
                if list3[k] == list4[0]:
                    count_neg = count_neg + 1
                elif list3[k] == list4[1]:
                    count_neut = count_neut + 1
                elif list3[k] == list4[2]:
                    count_pos = count_pos + 1 
                        
            all_values[list1[k]] = [(count_pos/len(list1)*100), (count_neg/len(list1)*100), (count_neut/len(list1)*100)]

    return all_values 

In [9]:
# if an app has n reviews in total and among all reviews, a reviews’ sentiment is positive, b reviews’ sentiment is neutral 
# and c reviews’ sentiment is negative, then three values added for this app should be a/n, b/n, c/n

def create_sentiment_dicitionary(dataframe):
    # removes the nan rows from the dataset
    dataframe = dataframe.dropna()
    
    # keeps the app names and the sentiment (+,- =)
    apps, sents = [], []
    
    # The dictionary that will return the sentiment calculated for each app listed
    all_values = dict()

    # gets all the unique App names form all rows in the dataset and the sentiment values
    appName = np.unique(dataframe['App']).tolist()
    sent = np.unique(dataframe['Sentiment']).tolist()

    for i in dataframe['App']:
        apps.append(i)

    for j in dataframe["Sentiment"]:
        sents.append(j)

    all_values = get_dictionary(apps, appName, sents, sent)
    
    return all_values

In [10]:
def calculate_sentiment(dataframe):
    dftest =  pd.DataFrame(list(create_sentiment_dicitionary(dataframe).items()), columns=['App', 'Sent'])
    dftest[['Positive_sentiment','Negative_Sentiment', 'Neutral_sentiment']] = dftest.Sent.apply(lambda x: pd.Series(str(x).split(" "))) 
    del dftest['Sent']
        
    # merges the dataframe 2 with the one containing only the calculated values for sentiment 
    data2 = pd.merge(dataframe, dftest, on='App')
    
    # return the newly dataset on reviews
    return data2

In [11]:
def merge_datasets(dataframe1, dataframe2):

    # merge both dataframes into one full dataset
    dataset = pd.merge(dataframe1, dataframe2, on='App')

    return dataset

In [14]:
def main():
    
    # the files 
    inputfile1 = "C:/Users/Wendy/Desktop/Final Project/google-play-store-apps/googleplaystore.csv"
    inputfile2 = "C:/Users/Wendy/Desktop/Final Project/google-play-store-apps/googleplaystore_user_reviews.csv"

    # imports file 
    df1 = pd.read_csv(inputfile1)
    df2 = pd.read_csv(inputfile2)
    
    # performs all the cleaning on the first dataframe
    dat1 = add_rating_level(df1)
    dat2 = remove_extras(dat1)
    dat3 = last_version_app(dat2)
    dat4 = version_needed(dat2)
    dat5 = classification_installs(dat4)
    dat6 = categorize_installs(dat5)
    
    new_df2 = calculate_sentiment(df2)
    
    
    data =  merge_datasets(dat6, new_df2)
    
    # removes the "+" from the installs
    data['Positive_sentiment'] = data['Positive_sentiment'].str.replace(r'[', '')
    data['Positive_sentiment'] = data['Positive_sentiment'].str.replace(r',', '')
    data['Negative_sentiment'] = data['Negative_Sentiment'].str.replace(r',', '')
    data['Neutral_sentiment'] = data['Neutral_sentiment'].str.replace(r']', '')

    print(data)
    
    data.to_csv('fullDataset.csv')
    

In [15]:
if __name__ == "__main__" :
    main()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


                                 App        Category  Rating Reviews Size  \
0                Coloring book moana  ART_AND_DESIGN     3.9     967   14   
1                Coloring book moana  ART_AND_DESIGN     3.9     967   14   
2                Coloring book moana  ART_AND_DESIGN     3.9     967   14   
3                Coloring book moana  ART_AND_DESIGN     3.9     967   14   
4                Coloring book moana  ART_AND_DESIGN     3.9     967   14   
5                Coloring book moana  ART_AND_DESIGN     3.9     967   14   
6                Coloring book moana  ART_AND_DESIGN     3.9     967   14   
7                Coloring book moana  ART_AND_DESIGN     3.9     967   14   
8                Coloring book moana  ART_AND_DESIGN     3.9     967   14   
9                Coloring book moana  ART_AND_DESIGN     3.9     967   14   
10               Coloring book moana  ART_AND_DESIGN     3.9     967   14   
11               Coloring book moana  ART_AND_DESIGN     3.9     967   14   