In [2]:
import numpy as np
import pandas as pd
from google_play_scraper import reviews, reviews_all
from google_play_scraper import Sort

In [3]:
apps = {
    "Mobicip" : "mobicip.com.safeBrowserff",
    "Google Family Link" : "com.google.android.apps.kids.familylink",
    "Net Nanny" : "com.contentwatch.ghoti.cp2.parent",
    "Kaspersky SafeKids" : "com.kaspersky.safekids"
}

In [4]:
sort = Sort.NEWEST
n_reviews = 10000
reviews_dict = {k : {} for k in apps}

In [5]:
for app in apps.keys():
        reviews_dict[app] = reviews_all(
                apps[app],
                lang = 'en',
                sleep_milliseconds=0,
                country = 'us',
                sort = sort,
                count = n_reviews,
                filter_score_with = None
        )

In [17]:
# reviews_dict['Mobicip']

In [6]:
df = pd.DataFrame()
for app in apps.keys():
        SAVE_DIR = app + '.csv'
        temp_df = pd.DataFrame(
                reviews_dict[app],
                columns = ['reviewId', 'userName', 'content', 'score'] 
        )
        temp_df.to_csv(SAVE_DIR, index = False)
        temp_df["app"] = app
        df = pd.concat((df ,temp_df))
df.to_csv("all_combined.csv", index = False)

In [7]:
df['feedback'] = df['score'].apply(lambda x: 0 if x < 3 else 1)

In [8]:
X = df['content']
y = df['feedback']

In [18]:
# df.head()

In [9]:
df.shape

(1592, 6)

In [10]:
print("Number of words: ")
print(len(np.unique(np.hstack(X))))

Number of words: 
1514


In [11]:
X.isnull().values.any()

False

In [12]:
y.value_counts()

feedback
1    819
0    773
Name: count, dtype: int64

In [13]:
import spacy

nlp = spacy.load("en_core_web_sm")

def preprocess(text):
        doc = nlp(text)
        filtered_tokens = []
        for token in doc:
                if token.is_stop or token.is_punct:
                        continue
                filtered_tokens.append(token.lemma_)
        
        return " ".join(filtered_tokens)

In [14]:
df['preprocessed_text'] = df['content'].apply(preprocess)

In [15]:
df['preprocessed_text'].head()

0                                           waste time
1    extremely impressed Mobicip Customer Service q...
2                                             friendly
3                                                 work
4                                      Invades privacy
Name: preprocessed_text, dtype: object

In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df['preprocessed_text'], 
    df['feedback'],
    test_size = 0.2,
    random_state = 348,
    stratify = df['feedback']
)

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

Tokenizer = TfidfVectorizer()
X_train_2 = Tokenizer.fit_transform(X_train).toarray()
X_test_2 = Tokenizer.transform(X_test).toarray()

In [27]:
X_train_2.shape

(1273, 2763)

In [21]:
from keras.layers import Dense, Dropout
from keras.models import Sequential

ann_model = Sequential([
        Dense(units = 1024, input_dim = X_train_2.shape[1], activation = 'relu'),
        Dense(units = 128, activation = 'relu'),
        Dense(units = 1, activation = 'sigmoid')
])

ann_model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['acc'])

In [24]:
history = ann_model.fit(X_train_2, y_train, batch_size=128, epochs=10, validation_split=0.05)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [25]:
ann_score = ann_model.evaluate(X_test_2, y_test, verbose = 1)

