## Prerequisite

In [1]:
# !pip install google-play-scraper
# !pip install nltk
# !pip install tensorflow

Import libraries

In [1]:
from google_play_scraper import app, Sort, reviews_all
import pandas as pd
import numpy as np
import json, os, uuid
from sklearn.model_selection import train_test_split

import re
import nltk
from nltk.corpus import stopwords
import tensorflow as tf
from keras.preprocessing.text import one_hot, Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dropout, Dense
from keras.layers import Flatten, GlobalMaxPooling1D, Embedding, Conv1D, LSTM

2024-03-15 10:35:19.936053: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-15 10:35:20.544696: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-15 10:35:20.547397: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Collect google play reviews using google-play-scraper

In [7]:
g_reviews = reviews_all(
        "mobicip.com.safeBrowserff", # application ID
        sleep_milliseconds=0, # defaults to 0
        lang='en', # defaults to 'en'
        country='us', # defaults to 'us'
        sort=Sort.NEWEST, # defaults to Sort.MOST_RELEVANT
    )

## Data pre-processing

### Arrange data into dataframes

In [8]:
g_df = pd.DataFrame(np.array(g_reviews),columns=['review'])
g_df2 = g_df.join(pd.DataFrame(g_df.pop('review').tolist()))

In [9]:
g_df2.drop(columns={'userImage', 'reviewCreatedVersion'},inplace = True) # drop unnecessary columns
g_df2.rename(columns= {'score': 'rating','userName': 'user_name', 'reviewId': 'review_id', 'content': 'review_description', 'at': 'review_date', 'replyContent': 'developer_response', 'repliedAt': 'developer_response_date', 'thumbsUpCount': 'thumbs_up'},inplace = True)
# g_df2.insert(loc=0, column='source', value='Google Play')
# g_df2.insert(loc=3, column='review_title', value=None)
# g_df2['laguage_code'] = 'en'
# g_df2['country_code'] = 'us'

In [10]:
g_df2.shape

(796, 9)

In [11]:
# storing into a .csv file for future use

g_df2.to_csv('play_reviews.csv')

In [12]:
g_df2['rating'].value_counts()

rating
1    352
5    321
2     53
3     36
4     34
Name: count, dtype: int64

In [13]:
g_df2.head()

Unnamed: 0,review_id,user_name,review_description,rating,thumbs_up,review_date,developer_response,developer_response_date,appVersion
0,9b96fe32-d8bc-448f-bf37-eaa79506a6b6,Tugsan Topcuoglu,Waste of time,1,0,2024-03-06 03:58:33,,NaT,2.2.8_r824
1,d5cb1046-ee6d-4fc5-ac03-c873e6ded9df,Roxanne LaRusso,I am extremely impressed with Mobicip's Custom...,5,0,2024-02-26 19:45:52,,NaT,
2,fe265e2e-117d-42b2-b8ca-63fc24e551b3,Williamcadder Mutevedzi,Friendly,5,0,2024-02-24 22:13:47,,NaT,
3,f44146dc-a7a3-445e-8183-6b153f482555,Justyna D,Does not work,1,0,2024-02-22 01:06:51,,NaT,2.2.7_r812
4,69bc7a7b-1cca-409f-96b1-8b19288ab427,Liam Whitwam,Invades privacy,1,0,2024-02-13 09:18:45,,NaT,2.2.4_r791


### Establishing data (X, Y)

In [3]:
# read data from csv

df = pd.read_csv('play_reviews.csv')

In [4]:
# create new column for binary classification

df['feedback'] = df['rating'].apply(lambda x: 0 if x < 3 else 1)

In [5]:
# declare variables

X = df['review_description']
Y = df['feedback']

In [6]:
# get count of vocabulary

print("Number of words: ")
print(len(np.unique(np.hstack(X))))

Number of words: 
789


In [7]:
# check if any null values are present in reviews part

X.isnull().values.any()

False

### Preparing data for embedding (Removal of stop words, punctuations, extra spaces)

In [29]:
stopwords_list = set(stopwords.words('english'))

In [30]:
class CustomPreprocess():

    def __init__(self):
        pass

    def preprocess_text(self,sen):
        sen = sen.lower()

        # # Remove html tags
        # sentence = remove_tags(sen)

        # Remove punctuations and numbers
        sen = re.sub('[^a-zA-Z]', ' ', sen)

        # Single character removal
        sen = re.sub(r"\s+[a-zA-Z]\s+", ' ', sen)

        # Remove multiple spaces
        sen = re.sub(r'\s+', ' ', sen)

        # Remove Stopwords
        pattern = re.compile(r'\b(' + r'|'.join(stopwords_list) + r')\b\s*')
        sen = pattern.sub('', sen)

        return sen

In [31]:
# define preprocessing

custom = CustomPreprocess()
X_new = []
sentences = list(X)
for sen in sentences:
  X_new.append(custom.preprocess_text(sen))

In [32]:
type(X_new[0])

str

In [34]:
# split data into training and testing sets

X_train, X_test, Y_train, Y_test = train_test_split(X_new, Y, test_size = 0.25)

In [None]:
# tokenize the sentences into arrays of words

word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(X_new)

X_train = word_tokenizer.texts_to_sequences(X_train)
X_test = word_tokenizer.texts_to_sequences(X_test)

In [None]:
vocab_length = len(word_tokenizer.word_index) + 1

vocab_length

2222

In [None]:
# Padding all reviews to fixed length 100

X_train = pad_sequences(X_train, padding='post', maxlen = 100)
X_test = pad_sequences(X_test, padding='post', maxlen = 100)

### Data Processing - 2

Remove punctuations, numbers and special characters

In [8]:
# remove any special characters
def remove_special_characters(text):
        pattern = r'[^a-zA-Z\s]'
        return re.sub(pattern, "", text)

X_2 = X.apply(remove_special_characters)

# remove any words whose length is less than 3
X_2 = X.apply(lambda x: ' '.join([word for word in x.split() if len(word) >= 3]))

Tokenization

In [9]:
tokens = X_2.apply(lambda x: x.split())

tokens.head()

0                                        [Waste, time]
1    [extremely, impressed, with, Mobicip's, Custom...
2                                           [Friendly]
3                                    [Does, not, work]
4                                   [Invades, privacy]
Name: review_description, dtype: object

Stemming

In [10]:
from nltk import PorterStemmer

ps = PorterStemmer()
tokens = tokens.apply(lambda x: [ps.stem(i) for i in x])

tokens.head()

0                                         [wast, time]
1    [extrem, impress, with, mobicip', custom, serv...
2                                           [friendli]
3                                     [doe, not, work]
4                                     [invad, privaci]
Name: review_description, dtype: object

In [11]:
for i in range(len(tokens)):
    tokens[i] = ' '.join(tokens[i])

X_2 = tokens
X_2.head()

0                                            wast time
1    extrem impress with mobicip' custom service. t...
2                                             friendli
3                                         doe not work
4                                        invad privaci
Name: review_description, dtype: object

Bag Of Words

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

bow_vectorizer = CountVectorizer(stop_words='english')
bow = bow_vectorizer.fit_transform(X_2)
df_bow = pd.DataFrame(bow.todense())

df_bow.head()
# np.sum(df_bow[0])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2117,2118,2119,2120,2121,2122,2123,2124,2125,2126
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(X_2)
df_tfidf = pd.DataFrame(tfidf_matrix.todense())

df_tfidf.shape


(796, 2127)

In [14]:
Y.shape

(796,)

Train-test split

In [15]:
X_train, X_test, Y_train, Y_test = train_test_split(df_tfidf, Y, test_size = 0.2)

In [16]:
X_test.shape

(160, 2127)

## Word embeddings

### GloVe embeddings

In [20]:
# Load GloVe word embeddings and create an Embeddings Dictionary

from numpy import asarray
from numpy import zeros

embeddings_dictionary = dict()
glove_file = open('dataset/glove.6B.100d.txt', encoding="utf8")

for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary [word] = vector_dimensions

glove_file.close()

In [21]:
# Create Embedding Matrix having 100 columns
# Containing 100-dimensional GloVe word embeddings for all words in our corpus.

embedding_matrix = zeros((vocab_length, 100))

for word, index in word_tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

NameError: name 'vocab_length' is not defined

## Models

### ANN

In [19]:
# simple neural network

ann_model = Sequential([
    Embedding(vocab_length, 100, weights = [embedding_matrix], input_length = 100, trainable = False),
    Flatten(),
    Dense(1, activation = 'sigmoid'),
])

# compile the model

ann_model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['acc'])

print(ann_model.summary())

NameError: name 'vocab_length' is not defined

In [26]:
# train the model

ann_history = ann_model.fit(X_train, Y_train, batch_size = 128, epochs = 5, verbose = 2, validation_split = 0.2)

Epoch 1/5
3/3 - 2s - loss: 0.7184 - acc: 0.4986 - val_loss: 0.6603 - val_acc: 0.6556 - 2s/epoch - 546ms/step
Epoch 2/5
3/3 - 0s - loss: 0.6390 - acc: 0.6387 - val_loss: 0.6391 - val_acc: 0.7000 - 96ms/epoch - 32ms/step
Epoch 3/5
3/3 - 0s - loss: 0.5859 - acc: 0.7703 - val_loss: 0.6262 - val_acc: 0.7444 - 106ms/epoch - 35ms/step
Epoch 4/5
3/3 - 0s - loss: 0.5490 - acc: 0.8039 - val_loss: 0.6136 - val_acc: 0.7667 - 96ms/epoch - 32ms/step
Epoch 5/5
3/3 - 0s - loss: 0.5201 - acc: 0.8487 - val_loss: 0.5988 - val_acc: 0.7667 - 138ms/epoch - 46ms/step


In [27]:
# get the loss and accuracy

ann_score = ann_model.evaluate(X_test, Y_test, verbose = 1)



### ANN - 2

In [17]:
X_train.shape

(636, 2127)

In [18]:
ann_model_2 = Sequential([
        Dense(X_train.shape[1], input_dim = X_train.shape[1], activation  = 'relu'),
        Dense(units = X_train.shape[1]//4, activation = 'relu'),
        Dense(1, activation = 'sigmoid')
])

ann_model_2.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['acc'])

print(ann_model_2.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 2127)              4526256   
                                                                 
 dense_1 (Dense)             (None, 531)               1129968   
                                                                 
 dense_2 (Dense)             (None, 1)                 532       
                                                                 
Total params: 5656756 (21.58 MB)
Trainable params: 5656756 (21.58 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


2024-03-15 10:36:18.414702: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-03-15 10:36:18.415276: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1960] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [19]:
ann_history_2 = ann_model_2.fit(X_train, Y_train, batch_size = 128, epochs = 10, verbose = 2, validation_split = 0.05)

Epoch 1/10
5/5 - 1s - loss: 0.6746 - acc: 0.6109 - val_loss: 0.6390 - val_acc: 0.7812 - 1s/epoch - 207ms/step
Epoch 2/10
5/5 - 0s - loss: 0.4823 - acc: 0.9570 - val_loss: 0.4956 - val_acc: 0.8750 - 316ms/epoch - 63ms/step
Epoch 3/10
5/5 - 0s - loss: 0.1988 - acc: 0.9719 - val_loss: 0.4067 - val_acc: 0.6875 - 313ms/epoch - 63ms/step
Epoch 4/10
5/5 - 0s - loss: 0.0594 - acc: 0.9934 - val_loss: 0.4654 - val_acc: 0.7812 - 315ms/epoch - 63ms/step
Epoch 5/10
5/5 - 0s - loss: 0.0164 - acc: 0.9983 - val_loss: 0.5931 - val_acc: 0.7812 - 309ms/epoch - 62ms/step
Epoch 6/10
5/5 - 0s - loss: 0.0061 - acc: 0.9983 - val_loss: 0.7412 - val_acc: 0.7500 - 310ms/epoch - 62ms/step
Epoch 7/10
5/5 - 0s - loss: 0.0012 - acc: 1.0000 - val_loss: 0.8538 - val_acc: 0.6875 - 315ms/epoch - 63ms/step
Epoch 8/10
5/5 - 0s - loss: 6.4965e-04 - acc: 1.0000 - val_loss: 0.9110 - val_acc: 0.7500 - 309ms/epoch - 62ms/step
Epoch 9/10
5/5 - 0s - loss: 3.0442e-04 - acc: 1.0000 - val_loss: 0.9458 - val_acc: 0.7500 - 310ms/epoc

In [20]:
ann_score_2 = ann_model_2.evaluate(X_test, Y_test, verbose = 1)



### CNN

In [29]:
from keras.layers import Conv1D

# define the model

cnn_model = Sequential([
    Embedding(vocab_length, 100, weights = [embedding_matrix], input_length = 100, trainable = False),
    Conv1D(128, 5, activation = 'relu'),
    GlobalMaxPooling1D(),
    Dense(1, activation = 'sigmoid')
])

# compile the model

cnn_model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['acc'])

print(cnn_model.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 100, 100)          224200    
                                                                 
 conv1d (Conv1D)             (None, 96, 128)           64128     
                                                                 
 global_max_pooling1d (Glob  (None, 128)               0         
 alMaxPooling1D)                                                 
                                                                 
 dense_1 (Dense)             (None, 1)                 129       
                                                                 
Total params: 288457 (1.10 MB)
Trainable params: 64257 (251.00 KB)
Non-trainable params: 224200 (875.78 KB)
_________________________________________________________________
None


In [30]:
cnn_history = cnn_model.fit(X_train, Y_train, batch_size = 128, epochs = 5, verbose = 2, validation_split = 0.2)

Epoch 1/5
3/3 - 1s - loss: 0.6915 - acc: 0.5602 - val_loss: 0.5871 - val_acc: 0.7111 - 1s/epoch - 395ms/step
Epoch 2/5
3/3 - 0s - loss: 0.5368 - acc: 0.8235 - val_loss: 0.5383 - val_acc: 0.8333 - 286ms/epoch - 95ms/step
Epoch 3/5
3/3 - 0s - loss: 0.4439 - acc: 0.8964 - val_loss: 0.4941 - val_acc: 0.8000 - 293ms/epoch - 98ms/step
Epoch 4/5
3/3 - 0s - loss: 0.3757 - acc: 0.9076 - val_loss: 0.4617 - val_acc: 0.8444 - 292ms/epoch - 97ms/step
Epoch 5/5
3/3 - 0s - loss: 0.3275 - acc: 0.9524 - val_loss: 0.4416 - val_acc: 0.8333 - 311ms/epoch - 104ms/step


In [31]:
cnn_score = cnn_model.evaluate(X_test, Y_test, verbose = 1)



### LSTM

In [32]:
from keras.layers import LSTM

In [33]:
# define the model

rnn_model = Sequential([
    Embedding(vocab_length, 100, weights = [embedding_matrix], input_length = 100, trainable = False),
    LSTM(128),
    Dense(1, activation = 'sigmoid')
])

# compile the model

rnn_model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['acc'])

print(rnn_model.summary())

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 100, 100)          224200    
                                                                 
 lstm (LSTM)                 (None, 128)               117248    
                                                                 
 dense_2 (Dense)             (None, 1)                 129       
                                                                 
Total params: 341577 (1.30 MB)
Trainable params: 117377 (458.50 KB)
Non-trainable params: 224200 (875.78 KB)
_________________________________________________________________
None


In [34]:
rnn_history = rnn_model.fit(X_train, Y_train, batch_size = 128, epochs = 5, verbose = 2, validation_split = 0.2)

Epoch 1/5
3/3 - 5s - loss: 0.6923 - acc: 0.5574 - val_loss: 0.6872 - val_acc: 0.6333 - 5s/epoch - 2s/step
Epoch 2/5
3/3 - 1s - loss: 0.6855 - acc: 0.5966 - val_loss: 0.6781 - val_acc: 0.6444 - 943ms/epoch - 314ms/step
Epoch 3/5
3/3 - 1s - loss: 0.6774 - acc: 0.5966 - val_loss: 0.6605 - val_acc: 0.6444 - 934ms/epoch - 311ms/step
Epoch 4/5
3/3 - 1s - loss: 0.6710 - acc: 0.5994 - val_loss: 0.6509 - val_acc: 0.6444 - 923ms/epoch - 308ms/step
Epoch 5/5
3/3 - 1s - loss: 0.6690 - acc: 0.5994 - val_loss: 0.6513 - val_acc: 0.6444 - 977ms/epoch - 326ms/step


In [35]:
rnn_score = rnn_model.evaluate(X_test, Y_test, verbose = 1)



## Testing models

In [21]:
sen = pd.read_csv('dataset/test_dataset.csv')
print("testing dataset - ")
print(sen.head())

test = sen['test_reviews'].apply(remove_special_characters)
test = sen['test_reviews'].apply(lambda x: ' '.join([word for word in x.split() if len(word) >= 3]))
tokens = test.apply(lambda x: x.split())
for i in range(len(tokens)):
    tokens[i] = ' '.join(tokens[i])
test = tokens

tfidf_test = tfidf.transform(test)
test_df = pd.DataFrame(tfidf_test.todense())

output = ann_model_2.predict(test_df)

print(*output)

testing dataset - 
               test_reviews
0                i liked it
1                      nice
2  it was buggy and lagging
3            waste of money
4                  worth it
[0.40803406] [0.99945974] [0.01426365] [0.0013396] [0.9977129]
