## Prerequisite

In [1]:
!pip install google-play-scraper



Import libraries

In [2]:
from google_play_scraper import app, Sort, reviews_all
import pandas as pd
import numpy as np
import json, os, uuid
from sklearn.model_selection import train_test_split

import re
import nltk
from nltk.corpus import stopwords
import tensorflow as tf
from keras.preprocessing.text import one_hot, Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dropout, Dense
from keras.layers import Flatten, GlobalMaxPooling1D, Embedding, Conv1D, LSTM

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Collect google play reviews using google-play-scraper

In [4]:
g_reviews = reviews_all(
        "mobicip.com.safeBrowserff", # application ID
        sleep_milliseconds=0, # defaults to 0
        lang='en', # defaults to 'en'
        country='us', # defaults to 'us'
        sort=Sort.NEWEST, # defaults to Sort.MOST_RELEVANT
    )

## Data pre-processing

### Arrange data into dataframes

In [5]:
g_df = pd.DataFrame(np.array(g_reviews),columns=['review'])
g_df2 = g_df.join(pd.DataFrame(g_df.pop('review').tolist()))

In [6]:
g_df2.drop(columns={'userImage', 'reviewCreatedVersion'},inplace = True) # drop unnecessary columns
g_df2.rename(columns= {'score': 'rating','userName': 'user_name', 'reviewId': 'review_id', 'content': 'review_description', 'at': 'review_date', 'replyContent': 'developer_response', 'repliedAt': 'developer_response_date', 'thumbsUpCount': 'thumbs_up'},inplace = True)
# g_df2.insert(loc=0, column='source', value='Google Play')
# g_df2.insert(loc=3, column='review_title', value=None)
# g_df2['laguage_code'] = 'en'
# g_df2['country_code'] = 'us'

In [7]:
g_df2.shape

(597, 9)

In [8]:
# storing into a .csv file for future use

g_df2.to_csv('play_reviews.csv')

In [9]:
g_df2['rating'].value_counts()

1    305
5    195
2     46
3     30
4     21
Name: rating, dtype: int64

In [10]:
g_df2.head()

Unnamed: 0,review_id,user_name,review_description,rating,thumbs_up,review_date,developer_response,developer_response_date,appVersion
0,9b96fe32-d8bc-448f-bf37-eaa79506a6b6,Tugsan Topcuoglu,Waste of time,1,0,2024-03-05 22:28:33,,NaT,2.2.8_r824
1,d5cb1046-ee6d-4fc5-ac03-c873e6ded9df,Roxanne LaRusso,I am extremely impressed with Mobicip's Custom...,5,0,2024-02-26 14:15:52,,NaT,
2,fe265e2e-117d-42b2-b8ca-63fc24e551b3,Williamcadder Mutevedzi,Friendly,5,0,2024-02-24 16:43:47,,NaT,
3,f44146dc-a7a3-445e-8183-6b153f482555,Justyna D,Does not work,1,0,2024-02-21 19:36:51,,NaT,2.2.7_r812
4,69bc7a7b-1cca-409f-96b1-8b19288ab427,Liam Whitwam,Invades privacy,1,0,2024-02-13 03:48:45,,NaT,2.2.4_r791


### Establishing data (X, Y)

In [11]:
# read data from csv

df = pd.read_csv('/content/play_reviews.csv')

In [12]:
# create new column for binary classification

df['feedback'] = df['rating'].apply(lambda x: 0 if x < 3 else 1)

In [13]:
# declare variables

X = df['review_description']
Y = df['feedback']

In [14]:
# get count of vocabulary

print("Number of words: ")
print(len(np.unique(np.hstack(X))))

Number of words: 
593


In [15]:
# check if any null values are present in reviews part

X.isnull().values.any()

False

### Preparing data for embedding

In [16]:
stopwords_list = set(stopwords.words('english'))

In [17]:
class CustomPreprocess():

    def __init__(self):
        pass

    def preprocess_text(self,sen):
        sen = sen.lower()

        # # Remove html tags
        # sentence = remove_tags(sen)

        # Remove punctuations and numbers
        sen = re.sub('[^a-zA-Z]', ' ', sen)

        # Single character removal
        sen = re.sub(r"\s+[a-zA-Z]\s+", ' ', sen)

        # Remove multiple spaces
        sen = re.sub(r'\s+', ' ', sen)

        # Remove Stopwords
        pattern = re.compile(r'\b(' + r'|'.join(stopwords_list) + r')\b\s*')
        sen = pattern.sub('', sen)

        return sen

In [18]:
# define preprocessing

custom = CustomPreprocess()
X_new = []
sentences = list(X)
for sen in sentences:
  X_new.append(custom.preprocess_text(sen))

In [101]:
type(X_new[0])

str

In [19]:
# split data into training and testing sets

X_train, X_test, Y_train, Y_test = train_test_split(X_new, Y, test_size = 0.25)

## Word embeddings

In [20]:
# tokenize the sentences into arrays of words

word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(X_new)

X_train = word_tokenizer.texts_to_sequences(X_train)
X_test = word_tokenizer.texts_to_sequences(X_test)

In [21]:
vocab_length = len(word_tokenizer.word_index) + 1

vocab_length

2242

In [22]:
# Padding all reviews to fixed length 100

X_train = pad_sequences(X_train, padding='post', maxlen = 100)
X_test = pad_sequences(X_test, padding='post', maxlen = 100)

In [23]:
# Load GloVe word embeddings and create an Embeddings Dictionary

from numpy import asarray
from numpy import zeros

embeddings_dictionary = dict()
glove_file = open('/content/drive/MyDrive/Datasets/glove.6B.100d.txt', encoding="utf8")

for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary [word] = vector_dimensions

glove_file.close()

In [24]:
# Create Embedding Matrix having 100 columns
# Containing 100-dimensional GloVe word embeddings for all words in our corpus.

embedding_matrix = zeros((vocab_length, 100))

for word, index in word_tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

## Models

### ANN

In [25]:
# simple neural network

ann_model = Sequential([
    Embedding(vocab_length, 100, weights = [embedding_matrix], input_length = 100, trainable = False),
    Flatten(),
    Dense(1, activation = 'sigmoid'),
])

# compile the model

ann_model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['acc'])

print(ann_model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 100)          224200    
                                                                 
 flatten (Flatten)           (None, 10000)             0         
                                                                 
 dense (Dense)               (None, 1)                 10001     
                                                                 
Total params: 234201 (914.85 KB)
Trainable params: 10001 (39.07 KB)
Non-trainable params: 224200 (875.78 KB)
_________________________________________________________________
None


In [26]:
# train the model

ann_history = ann_model.fit(X_train, Y_train, batch_size = 128, epochs = 5, verbose = 2, validation_split = 0.2)

Epoch 1/5
3/3 - 2s - loss: 0.7184 - acc: 0.4986 - val_loss: 0.6603 - val_acc: 0.6556 - 2s/epoch - 546ms/step
Epoch 2/5
3/3 - 0s - loss: 0.6390 - acc: 0.6387 - val_loss: 0.6391 - val_acc: 0.7000 - 96ms/epoch - 32ms/step
Epoch 3/5
3/3 - 0s - loss: 0.5859 - acc: 0.7703 - val_loss: 0.6262 - val_acc: 0.7444 - 106ms/epoch - 35ms/step
Epoch 4/5
3/3 - 0s - loss: 0.5490 - acc: 0.8039 - val_loss: 0.6136 - val_acc: 0.7667 - 96ms/epoch - 32ms/step
Epoch 5/5
3/3 - 0s - loss: 0.5201 - acc: 0.8487 - val_loss: 0.5988 - val_acc: 0.7667 - 138ms/epoch - 46ms/step


In [27]:
# get the loss and accuracy

ann_score = ann_model.evaluate(X_test, Y_test, verbose = 1)



### CNN

In [28]:
from keras.layers import Conv1D

In [29]:
# define the model

cnn_model = Sequential([
    Embedding(vocab_length, 100, weights = [embedding_matrix], input_length = 100, trainable = False),
    Conv1D(128, 5, activation = 'relu'),
    GlobalMaxPooling1D(),
    Dense(1, activation = 'sigmoid')
])

# compile the model

cnn_model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['acc'])

print(cnn_model.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 100, 100)          224200    
                                                                 
 conv1d (Conv1D)             (None, 96, 128)           64128     
                                                                 
 global_max_pooling1d (Glob  (None, 128)               0         
 alMaxPooling1D)                                                 
                                                                 
 dense_1 (Dense)             (None, 1)                 129       
                                                                 
Total params: 288457 (1.10 MB)
Trainable params: 64257 (251.00 KB)
Non-trainable params: 224200 (875.78 KB)
_________________________________________________________________
None


In [30]:
cnn_history = cnn_model.fit(X_train, Y_train, batch_size = 128, epochs = 5, verbose = 2, validation_split = 0.2)

Epoch 1/5
3/3 - 1s - loss: 0.6915 - acc: 0.5602 - val_loss: 0.5871 - val_acc: 0.7111 - 1s/epoch - 395ms/step
Epoch 2/5
3/3 - 0s - loss: 0.5368 - acc: 0.8235 - val_loss: 0.5383 - val_acc: 0.8333 - 286ms/epoch - 95ms/step
Epoch 3/5
3/3 - 0s - loss: 0.4439 - acc: 0.8964 - val_loss: 0.4941 - val_acc: 0.8000 - 293ms/epoch - 98ms/step
Epoch 4/5
3/3 - 0s - loss: 0.3757 - acc: 0.9076 - val_loss: 0.4617 - val_acc: 0.8444 - 292ms/epoch - 97ms/step
Epoch 5/5
3/3 - 0s - loss: 0.3275 - acc: 0.9524 - val_loss: 0.4416 - val_acc: 0.8333 - 311ms/epoch - 104ms/step


In [31]:
cnn_score = cnn_model.evaluate(X_test, Y_test, verbose = 1)



### LSTM

In [32]:
from keras.layers import LSTM

In [33]:
# define the model

rnn_model = Sequential([
    Embedding(vocab_length, 100, weights = [embedding_matrix], input_length = 100, trainable = False),
    LSTM(128),
    Dense(1, activation = 'sigmoid')
])

# compile the model

rnn_model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['acc'])

print(rnn_model.summary())

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 100, 100)          224200    
                                                                 
 lstm (LSTM)                 (None, 128)               117248    
                                                                 
 dense_2 (Dense)             (None, 1)                 129       
                                                                 
Total params: 341577 (1.30 MB)
Trainable params: 117377 (458.50 KB)
Non-trainable params: 224200 (875.78 KB)
_________________________________________________________________
None


In [34]:
rnn_history = rnn_model.fit(X_train, Y_train, batch_size = 128, epochs = 5, verbose = 2, validation_split = 0.2)

Epoch 1/5
3/3 - 5s - loss: 0.6923 - acc: 0.5574 - val_loss: 0.6872 - val_acc: 0.6333 - 5s/epoch - 2s/step
Epoch 2/5
3/3 - 1s - loss: 0.6855 - acc: 0.5966 - val_loss: 0.6781 - val_acc: 0.6444 - 943ms/epoch - 314ms/step
Epoch 3/5
3/3 - 1s - loss: 0.6774 - acc: 0.5966 - val_loss: 0.6605 - val_acc: 0.6444 - 934ms/epoch - 311ms/step
Epoch 4/5
3/3 - 1s - loss: 0.6710 - acc: 0.5994 - val_loss: 0.6509 - val_acc: 0.6444 - 923ms/epoch - 308ms/step
Epoch 5/5
3/3 - 1s - loss: 0.6690 - acc: 0.5994 - val_loss: 0.6513 - val_acc: 0.6444 - 977ms/epoch - 326ms/step


In [35]:
rnn_score = rnn_model.evaluate(X_test, Y_test, verbose = 1)



## Testing models

In [43]:
sen = [
    "i liked it",
    "nice",
    "i was buggy and lagging"
    ]

processed_data = []

for s in sen:
  processed_data.append(custom.preprocess_text(s))

processed_data = word_tokenizer.texts_to_sequences(processed_data)
processed_data = pad_sequences(processed_data, padding = 'post', maxlen = 100)

output = cnn_model.predict(processed_data)
print(output)

[[0.4553344 ]
 [0.46020716]
 [0.41867155]]
