In [None]:
import os
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense, Conv2D, Flatten, MaxPooling2D, BatchNormalization, Dropout, LSTM, Activation
from tensorflow.keras.models import Sequential
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.utils import to_categorical

from sklearn.metrics import classification_report,confusion_matrix

from cv2 import imread, resize


# Task 1

In [2]:
df = pd.read_csv('../googleplaystore.csv')
row_10472 = df.loc[10472,:]
df.loc[10472,:] = [row_10472['App'], row_10472['Content Rating'], row_10472['Category'], row_10472['Rating'], row_10472['Reviews'],
                   row_10472['Size'], row_10472['Installs'], row_10472['Type'], row_10472['Price'], row_10472['Android Ver'], 
                   row_10472['Genres'], row_10472['Last Updated'], row_10472['Current Ver']]

df.drop(['Type','Category','Current Ver','Android Ver'],axis=1, inplace=True)

lastUpdated = pd.to_datetime(df['Last Updated'])
df.loc[:,'Last Updated'] = pd.to_datetime(df.loc[:,'Last Updated'])
df.loc[:,'Last Updated'] = df.loc[:,'Last Updated'].map(lambda x: (datetime.today()-x).days)

df.loc[:,'Size'] = df.loc[:,'Size'].map(lambda x: float(x[:-1])*1000000 if x[-1] == 'M' else float(x[:-1])*1000 if x[-1] == 'k' else x)

#replacing $ in price
df['Price'].replace(
    {r'\$([0-9]*\.*[0-9]*)': r'\1'},
    inplace = True,
    regex = True)

#replacing , in installs
df['Installs'].replace(
    {r'([0-9]*)\,*([0-9]*)\,*([0-9]*)\,*([0-9]*)\+': r'\1\2\3\4'},
    inplace = True,
    regex = True)

#where varies with device 
df['Size'].replace("Varies with device", float("NaN"), inplace=True)
df.drop(df[pd.to_numeric(df.Size, errors = 'coerce').isnull()].index,
                  axis = 0,
                  inplace = True)
df.dropna(inplace=True)

#categorical values encoding (using label encoding https://pbpython.com/categorical-encoding.html#approach-2-label-encoding)
#print(df['Genres'].unique())
df = df.astype({
    'Genres':             'category',
    'Content Rating':     'category'
})

df['Genres Category'] = df['Genres'].cat.codes
df['Content Rating Category'] = df['Content Rating'].cat.codes

feature_columns = ['Reviews', 'Size', 'Installs',  'Price', 'Content Rating Category', 'Genres Category', 'Last Updated']
output_column = ['Rating']
relevant_data = df[feature_columns + output_column]
relevant_data = relevant_data.apply(pd.to_numeric)

#spliting in viewer sympathy in 2 classes by rating
relevant_data.Rating = relevant_data.Rating.map(lambda x: 1 if x>=4.0 else 0)

In [3]:
num_classes = 2

X_train, X_test, y_train, y_test = train_test_split(
    relevant_data[feature_columns], relevant_data[output_column], test_size = 0.5, random_state = 0
)

X_train = StandardScaler().fit_transform(X_train)
X_test = StandardScaler().fit_transform(X_test)

# convert class vectors to binary class matrices
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)
# print(y_test)

In [4]:

model = Sequential()
model.add(Dense(256, input_dim = len(feature_columns)))
model.add(Activation('relu'))
model.add(Dense(128))
model.add(Activation('relu'))
model.add(Dense(128))
model.add(Activation('relu'))
model.add(Dense(64))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(2, activation = 'sigmoid'))
model.compile(
    loss = 'categorical_crossentropy',
    optimizer = 'sgd',
    metrics=['accuracy']
)

history = model.fit(X_train,
                    y_train,
                    epochs=10,
                    batch_size=32,                                   
                    validation_split=0.2,
                    verbose = 1)

score = model.evaluate(X_test, y_test, verbose=0) 
print('\nTest score:', score[0]) 
print('Test accuracy:', score[1])

2022-01-05 16:05:59.292729: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/conda/lib/python3.9/site-packages/cv2/../../lib64:
2022-01-05 16:05:59.292773: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-01-05 16:05:59.292801: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (b2b9b6b183c4): /proc/driver/nvidia/version does not exist
2022-01-05 16:05:59.293066: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Test score: 0.5231268405914307
Test accuracy: 0.7673997282981873


# Task 2

In [5]:
imgs_per_celeb = 100
path = '../Sports-celebrity-imgs/'
dirs = ['Kane Williamson', 'Kobe Bryant', 'Maria Sharapova',]# 'Ronaldo']
X = []
y = []
count = 0
for d in dirs:
    full_path = path + d
    for image in os.listdir(full_path):
        img = imread((os.path.join(full_path, image)), 1)[...,::-1] / 255
        img = resize(img, (100, 100))
        X.append(img)
        y.append(count)
    count += 1
    
random.seed(1)
random.shuffle(X)
# reset the same seed to get the identical random sequence and shuffle the y
random.seed(1)
random.shuffle(y)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 13)
X_train = np.array(X_train)
y_train = to_categorical(np.array(y_train))
X_test = np.array(X_test)
y_test = to_categorical(np.array(y_test))

In [7]:
model = Sequential()
    
model.add(Conv2D(32, (3, 3), input_shape = (100, 100, 3)))
model.add(Activation('relu'))
model.add(BatchNormalization())
model.add(MaxPooling2D(pool_size = (2, 2)))

model.add(Conv2D(32, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size = (2, 2)))

model.add(Conv2D(64, (3, 3)))
model.add(Activation('relu'))
model.add(BatchNormalization())
model.add(MaxPooling2D(pool_size = (2, 2)))

model.add(Flatten())
model.add(Dense(64))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(3, activation = 'softmax'))

model.compile(
        loss = 'categorical_crossentropy',
        optimizer = 'rmsprop',
        metrics = [keras.metrics.Recall(name = 'recall'), 
                   keras.metrics.Accuracy(name = 'accuracy')]
    )

history = model.fit(X_train,
                    y_train,
                    epochs=10,
                    batch_size=16,                                   
                    validation_split=0.2,
                    verbose = 1)

score = model.evaluate(X_test, y_test, verbose=0) 
print('\nTest score:', score[0]) 
print('Test accuracy:', score[1])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Test score: 4.50176477432251
Test accuracy: 0.34328359365463257


# Task 3

In [8]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
df = pd.read_csv('../Corona_NLP_train.csv')
df = df[:5000]
df_test = pd.read_csv('../Corona_NLP_test.csv')
df_test = df_test[:5000]

def preprocess_df(df):
    df['OriginalTweet'].replace(
        {r'[^a-zA-Z\ ]': r''},
        inplace = True,
        regex = True)

    stop_words = stopwords.words('english')

    def filter_stop_words(text_tokens):
        filtered_text = []
        for w in text_tokens:
            if w not in stop_words:
                filtered_text.append(w)
        return filtered_text
    
    def to_int_category(x):
        if x == 'Negative':
            return 0
        elif x == 'Neutral':
            return 1
        elif x == 'Positive':
            return 2

    df.loc[:,'OriginalTweetPreprocessed'] = df.loc[:,'OriginalTweet'].map(lambda x: " ".join(filter_stop_words(word_tokenize(x))))
    df.loc[:,'Sentiment'] = df.loc[:,'Sentiment'].map(lambda x: x.replace('Extremely ', ''))
    df.loc[:,'Sentiment'] = df.loc[:,'Sentiment'].map(to_int_category)
    # words = word_tokenize(text)
#     df = df[['OriginalTweetPreprocessed', 'OriginalTweet', 'Sentiment']]
    df.drop(['UserName', 'ScreenName', 'Location', 'TweetAt'], axis=1, inplace=True)

preprocess_df(df)
preprocess_df(df_test)

In [9]:
df.head()

Unnamed: 0,OriginalTweet,Sentiment,OriginalTweetPreprocessed
0,MeNyrbie PhilGahan Chrisitv httpstcoiFzFAnPa a...,1,MeNyrbie PhilGahan Chrisitv httpstcoiFzFAnPa h...
1,advice Talk to your neighbours family to excha...,2,advice Talk neighbours family exchange phone n...
2,Coronavirus Australia Woolworths to give elder...,2,Coronavirus Australia Woolworths give elderly ...
3,My food stock is not the only one which is emp...,2,My food stock one emptyPLEASE dont panic THERE...
4,Me ready to go at supermarket during the COVID...,0,Me ready go supermarket COVID outbreakNot Im p...


In [28]:
train_dataset = (
    tf.data.Dataset.from_tensor_slices(
        (
            tf.cast(df['OriginalTweetPreprocessed'].values, tf.string),
            tf.cast(df['Sentiment'].values, tf.int64)
        )
    )
)

test_dataset = (
    tf.data.Dataset.from_tensor_slices(
        (
            tf.cast(df_test['OriginalTweetPreprocessed'].values, tf.string),
            tf.cast(df_test['Sentiment'].values, tf.int64)
        )
    )
)

In [29]:
import tensorflow as tf

BUFFER_SIZE = 10000
BATCH_SIZE = 64

train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
test_dataset = test_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [30]:
VOCAB_SIZE = 1000
encoder = tf.keras.layers.TextVectorization(
    max_tokens=VOCAB_SIZE)
encoder.adapt(df.OriginalTweetPreprocessed)

In [33]:
model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
#         input_dim=61,
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

In [34]:
history = model.fit(train_dataset, epochs=30,
                    validation_data=test_dataset,
                    validation_steps=30)

Epoch 1/30












Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [35]:
test_loss, test_acc = model.evaluate(test_dataset)

print('Test Loss:', test_loss)
print('Test Accuracy:', test_acc)


Test Loss: -104.16022491455078
Test Accuracy: 0.2983148992061615
