In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import preprocessing
from collections import defaultdict
from surprise import SVD
from surprise import KNNWithMeans
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split

# Suppressing Warnings
import warnings
warnings.filterwarnings('ignore')

#### Importing all the libraries

In [2]:
d1 = pd.read_csv('/Users/chopra/Documents/GL/Data Set/phone_user_review_file_1.csv', encoding = "ISO-8859-1",engine='python')
d2 = pd.read_csv('/Users/chopra/Documents/GL/Data Set/phone_user_review_file_2.csv', encoding = "ISO-8859-1",engine='python')
d3 = pd.read_csv('/Users/chopra/Documents/GL/Data Set/phone_user_review_file_3.csv', encoding = "ISO-8859-1",engine='python')
d4 = pd.read_csv('/Users/chopra/Documents/GL/Data Set/phone_user_review_file_4.csv', encoding = "ISO-8859-1",engine='python')
d5 = pd.read_csv('/Users/chopra/Documents/GL/Data Set/phone_user_review_file_5.csv', encoding = "ISO-8859-1",engine='python')
d6 = pd.read_csv('/Users/chopra/Documents/GL/Data Set/phone_user_review_file_6.csv', encoding = "ISO-8859-1",engine='python')


#### Merging all the files into one

In [3]:
df = pd.concat([d1,d2,d3,d4,d5,d6],axis=0)
df.head()

Unnamed: 0,phone_url,date,lang,country,source,domain,score,score_max,extract,author,product
0,/cellphones/samsung-galaxy-s8/,5/2/2017,en,us,Verizon Wireless,verizonwireless.com,10.0,10.0,As a diehard Samsung fan who has had every Sam...,CarolAnn35,Samsung Galaxy S8
1,/cellphones/samsung-galaxy-s8/,4/28/2017,en,us,Phone Arena,phonearena.com,10.0,10.0,Love the phone. the phone is sleek and smooth ...,james0923,Samsung Galaxy S8
2,/cellphones/samsung-galaxy-s8/,5/4/2017,en,us,Amazon,amazon.com,6.0,10.0,Adequate feel. Nice heft. Processor's still sl...,R. Craig,"Samsung Galaxy S8 (64GB) G950U 5.8"" 4G LTE Unl..."
3,/cellphones/samsung-galaxy-s8/,5/2/2017,en,us,Samsung,samsung.com,9.2,10.0,Never disappointed. One of the reasons I've be...,Buster2020,Samsung Galaxy S8 64GB (AT&T)
4,/cellphones/samsung-galaxy-s8/,5/11/2017,en,us,Verizon Wireless,verizonwireless.com,4.0,10.0,I've now found that i'm in a group of people t...,S Ate Mine,Samsung Galaxy S8


#### Check a few observations and shape of the data-frame.

In [4]:
df.shape

(1415133, 11)

In [5]:
df.dtypes

phone_url     object
date          object
lang          object
country       object
source        object
domain        object
score        float64
score_max    float64
extract       object
author        object
product       object
dtype: object

In [6]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
score,1351644.0,8.00706,2.616121,0.2,7.2,9.2,10.0,10.0
score_max,1351644.0,10.0,0.0,10.0,10.0,10.0,10.0,10.0


In [7]:
df.isna().sum()

phone_url        0
date             0
lang             0
country          0
source           0
domain           0
score        63489
score_max    63489
extract      19361
author       63202
product          1
dtype: int64

In [8]:
df = df.fillna(df.median())


In [9]:
df.isna().sum()

phone_url        0
date             0
lang             0
country          0
source           0
domain           0
score            0
score_max        0
extract      19361
author       63202
product          1
dtype: int64

#### We still have some missing values in author, and exract but we'll be using score mainly for the model building process. So we can drop these columns. 


In [10]:
df = df.dropna()
df.isna().sum()

phone_url    0
date         0
lang         0
country      0
source       0
domain       0
score        0
score_max    0
extract      0
author       0
product      0
dtype: int64

In [11]:
df.duplicated().sum()

4809

#### there seems to be many duplicate records in the dataframe.

In [12]:
df = df.drop_duplicates()
df.duplicated().sum()

0

#### Rounding off score to the nearest integer

In [13]:
df.score = round(df.score)

In [14]:
df.head()

Unnamed: 0,phone_url,date,lang,country,source,domain,score,score_max,extract,author,product
0,/cellphones/samsung-galaxy-s8/,5/2/2017,en,us,Verizon Wireless,verizonwireless.com,10.0,10.0,As a diehard Samsung fan who has had every Sam...,CarolAnn35,Samsung Galaxy S8
1,/cellphones/samsung-galaxy-s8/,4/28/2017,en,us,Phone Arena,phonearena.com,10.0,10.0,Love the phone. the phone is sleek and smooth ...,james0923,Samsung Galaxy S8
2,/cellphones/samsung-galaxy-s8/,5/4/2017,en,us,Amazon,amazon.com,6.0,10.0,Adequate feel. Nice heft. Processor's still sl...,R. Craig,"Samsung Galaxy S8 (64GB) G950U 5.8"" 4G LTE Unl..."
3,/cellphones/samsung-galaxy-s8/,5/2/2017,en,us,Samsung,samsung.com,9.0,10.0,Never disappointed. One of the reasons I've be...,Buster2020,Samsung Galaxy S8 64GB (AT&T)
4,/cellphones/samsung-galaxy-s8/,5/11/2017,en,us,Verizon Wireless,verizonwireless.com,4.0,10.0,I've now found that i'm in a group of people t...,S Ate Mine,Samsung Galaxy S8


#### Keep only 1000000 data samples. Use random state=612.

In [15]:
df_slice = df.sample(n=1000000, random_state=612)

#### Drop irrelevant features. Keep features like Author, Product, and Score.

In [16]:
df_slice = df_slice[["author","product","score"]]
df_slice.head()

Unnamed: 0,author,product,score
139931,Nestor,Smartphone Asus Zenfone 3 ZE520KL - 1B075BR Oc...,10.0
279854,Clark,"BlackBerry DTEK50 Unlocked GSM 5.2"" Touchscree...",10.0
143967,AndrÃ© Ryodi Nogami,Smartphone Samsung Galaxy A5 2016 Duos A510 De...,9.0
155405,Tiffany Stevenson,Sony Ericsson XPERIA X10 Mini Pro (U20i) Unloc...,4.0
206640,xceday,Nokia 3720 Classic cep telefonu,9.0


# 2

#### Identify the most rated features.

In [17]:
topproduct = pd.DataFrame(df_slice.groupby('product')['score'].mean().sort_values(ascending=False))
topproduct.head()

Unnamed: 0_level_0,score
product,Unnamed: 1_level_1
"Nokia XL - Smartphone libre Android (pantalla 5"", cÃ¡mara 5 Mp, 4 GB, Dual-Core 1 GHz, 768 MB RAM), naranja [importado]",10.0
HTC Desire 820 Bleu,10.0
"HTC Desire 816 Dual Sim 8GB Unlocked Android Smartphone Import, Retail Packaging, White",10.0
"NUWA Apple Lightning to USB (16GB USB Flash Drive) storage U disk Adapter connecter charger. For macbook pro mac Apple iPhone 5 / 5C / 5S 6, iPad Air, iPad mini,iPad 4, iPod Nano (7th) iPod touch (5th) 8 Pin connector-White",10.0
Samsung Galaxy S5 (SM-G900) Charcoal Black,10.0


####  Identify the users with most number of reviews.

In [18]:
topusers = pd.DataFrame(df_slice['author'].value_counts())
topusers.head()

Unnamed: 0,author
Amazon Customer,57792
Cliente Amazon,14624
e-bit,6317
Client d'Amazon,5698
Amazon Kunde,3574


#### products having more than 50 ratings

In [19]:
#products having more than 50 ratings

prod50 = pd.DataFrame(columns=['product','count'])
prod50['product'] = df_slice['product'].value_counts().index.tolist()
prod50['count'] = list(df_slice['product'].value_counts() > 50)

names = prod50[ prod50['count'] == False ].index 
prod50.drop(names, inplace = True)
prod50

Unnamed: 0,product,count
0,"Lenovo Vibe K4 Note (White,16GB)",True
1,"Lenovo Vibe K4 Note (Black, 16GB)",True
2,"OnePlus 3 (Graphite, 64 GB)",True
3,"OnePlus 3 (Soft Gold, 64 GB)",True
4,Samsung Galaxy Express I8730,True
...,...,...
4375,"LG Lucid 2, Black 8GB (Verizon Wireless)",True
4376,Samsung Galaxy S4 mini (i9195) âAndroid-puhe...,True
4377,Samsung Galaxy S6 SM-G920F 32GB,True
4378,LG Electronics WineSmart klappbares Smartphone...,True


#### Authors with more than 50 ratings

In [20]:
#authors with more than 50 ratings

auth50 = pd.DataFrame(columns=['author','count'])
auth50['author'] = df_slice['author'].value_counts().index.tolist()
auth50['count'] = list(df_slice['author'].value_counts() > 50)

names = auth50[ auth50['count'] == False ].index 
auth50.drop(names, inplace = True)
auth50

Unnamed: 0,author,count
0,Amazon Customer,True
1,Cliente Amazon,True
2,e-bit,True
3,Client d'Amazon,True
4,Amazon Kunde,True
...,...,...
669,federico,True
670,ÐÐ°ÑÐµÑÐ¸Ð½Ð°,True
671,Bezlikiy_Djo,True
672,Carla,True


#### Products having more than 50 ratings and users who have given more than 50 ratings.


In [21]:
#products having more than 50 ratings and users who have given more than 50 ratings.

prod_user_50 = df_slice[df_slice['product'].isin(prod50['product'])] 
prod_user_50

Unnamed: 0,author,product,score
4635,Dieter Kraft,Microsoft Nokia Lumia 630 Single-SIM Smartphon...,10.0
300224,derrrickman,Kyocera Kona Black (Virgin Mobile),10.0
352217,mario,"Samsung G900 Galaxy S5 Smartphone, 16 GB, Nero...",8.0
66629,Darpan,"OnePlus One (Sandstone Black, 64GB)",8.0
77416,Amazon Customer,"OnePlus 3 (Graphite, 64 GB)",10.0
...,...,...,...
167436,vikaaay,Nokia C3-00,6.0
179434,missylee,Huawei P8 Lite wit / 16 GB,9.0
87170,Sergio,Huawei Ascend Y330 - Smartphone libre Android ...,6.0
281617,matteo anatrella,"Huawei Ascend G510 Smartphone Touch, Fotocamer...",10.0


## Shape of data with products having more than 50 ratings and users who have given more than 50 ratings.

In [22]:
prod_user_50.shape

(559763, 3)

# 3. Popularity based model

In [23]:
mean_rating = pd.DataFrame(df_slice.groupby('product')['score'].mean().sort_values(ascending = False))

In [24]:
mean_rating['count'] = pd.DataFrame(df.groupby('product')['score'].count())  

In [25]:
mean_rating.head()

Unnamed: 0_level_0,score,count
product,Unnamed: 1_level_1,Unnamed: 2_level_1
"Nokia XL - Smartphone libre Android (pantalla 5"", cÃ¡mara 5 Mp, 4 GB, Dual-Core 1 GHz, 768 MB RAM), naranja [importado]",10.0,1
HTC Desire 820 Bleu,10.0,2
"HTC Desire 816 Dual Sim 8GB Unlocked Android Smartphone Import, Retail Packaging, White",10.0,1
"NUWA Apple Lightning to USB (16GB USB Flash Drive) storage U disk Adapter connecter charger. For macbook pro mac Apple iPhone 5 / 5C / 5S 6, iPad Air, iPad mini,iPad 4, iPod Nano (7th) iPod touch (5th) 8 Pin connector-White",10.0,2
Samsung Galaxy S5 (SM-G900) Charcoal Black,10.0,4


### Recommending top 5 mobiles 

In [26]:
# Recommending top 5 mobiles 

mean_rating.sort_values(by=['score','count'], ascending = False).head(5)

Unnamed: 0_level_0,score,count
product,Unnamed: 1_level_1,Unnamed: 2_level_1
Samsung Galaxy Note5,10.0,200
Motorola Smartphone Motorola Moto X Desbloqueado Preto Android 4.2.2 CÃ¢mera 10MP e Frontal 2MP MemÃ³ria Interna de 16GB GSM,10.0,186
Motorola Smartphone Motorola Moto G Dual Chip Desbloqueado TIM Android 4.3 Tela 4.5 8GB 3G Wi-Fi CÃ¢mera 5MP - Preto,10.0,173
Samsung Smartphone Dual Chip Samsung Galaxy SIII Duos Desbloqueado Claro Azul Android 4.1 3G/Wi-Fi CÃ¢mera 5MP,10.0,171
Nokia Smartphone Nokia Lumia 520 Desbloqueado Oi Preto Windows Phone 8 CÃ¢mera 5MP 3G Wi-Fi MemÃ³ria Interna 8G GPS,10.0,170


### Build a collaborative filtering model using SVD.


In [27]:
#Build a collaborative filtering model using SVD.
columns_titles = ['author','product','score']
df_svd = df_slice.reindex(columns=columns_titles)



In [28]:
df_svd_slice = df_svd.sample(n=5000, random_state=612)

In [29]:
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(df_svd_slice,reader = reader)

In [30]:
trainset = data.build_full_trainset()

In [31]:
algo = SVD()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fb2d85df880>

#### Making predictions on Test set

In [32]:
testset = trainset.build_anti_testset()

In [33]:
predictions = algo.test(testset)

In [34]:
def get_top_n(predictions, n=5):
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [35]:
top_n = get_top_n(predictions, n=5)

In [36]:
top_n

defaultdict(list,
            {'Utsav': [('Nokia N95', 9.419338389285981),
              ('Lenovo Motorola Moto G LTE Smartphone, Display 4.5", Fotocamera 5 MP, Memoria 8 GB, Quad-Core 1.2 GHz, 1 GB RAM, Micro SD, Android 4.4.3, Bianco [Italia]',
               8.952476608295653),
              ('Samsung Galaxy S7 edge 32GB (Verizon)', 8.910491330664787),
              ('Apple iPhone', 8.867057238715395),
              ('Samsung Galaxy S7 edge 32GB (T-Mobile)', 8.841334584239759)],
             'Sandeep': [('Samsung Galaxy S7 edge 32GB (Verizon)',
               8.721247761699647),
              ('Nokia N95', 8.64163324188001),
              ('Samsung Galaxy S7 edge 32GB (AT&T)', 8.47340072891233),
              ('Samsung Galaxy S7 edge 32GB (T-Mobile)', 8.385936709260061),
              ('Apple iPhone', 8.355054334526965)],
             'Erdbeerbluete': [('Samsung Galaxy S7 edge 32GB (Verizon)',
               9.174602144599117),
              ('Samsung Galaxy J3 (8GB)', 8.98487208114

In [37]:
#Recommended item for each user

for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])

Utsav ['Nokia N95', 'Lenovo Motorola Moto G LTE Smartphone, Display 4.5", Fotocamera 5 MP, Memoria 8 GB, Quad-Core 1.2 GHz, 1 GB RAM, Micro SD, Android 4.4.3, Bianco [Italia]', 'Samsung Galaxy S7 edge 32GB (Verizon)', 'Apple iPhone', 'Samsung Galaxy S7 edge 32GB (T-Mobile)']
Sandeep ['Samsung Galaxy S7 edge 32GB (Verizon)', 'Nokia N95', 'Samsung Galaxy S7 edge 32GB (AT&T)', 'Samsung Galaxy S7 edge 32GB (T-Mobile)', 'Apple iPhone']
Erdbeerbluete ['Samsung Galaxy S7 edge 32GB (Verizon)', 'Samsung Galaxy J3 (8GB)', 'Nokia N95', 'Samsung Galaxy S7 edge 32GB (AT&T)', 'Apple iPhone']
karen singleton ['Nokia N95', 'Samsung Galaxy S7 edge 32GB (Verizon)', 'Apple iPhone', 'Samsung Galaxy S7 edge 32GB (T-Mobile)', 'Nokia E72']
Six ['Nokia N95', 'Lenovo Motorola Moto G LTE Smartphone, Display 4.5", Fotocamera 5 MP, Memoria 8 GB, Quad-Core 1.2 GHz, 1 GB RAM, Micro SD, Android 4.4.3, Bianco [Italia]', 'Samsung Galaxy S7 edge 32GB (Verizon)', 'Samsung Galaxy S7 32GB (AT&T)', 'Samsung Galaxy S7 edge 

In [38]:
pred = algo.predict('Raquel', 'Nokia N95', verbose=True)


user: Raquel     item: Nokia N95  r_ui = None   est = 8.90   {'was_impossible': False}


In [39]:
red = algo.predict('Raquel', 'Samsung Galaxy S7 edge 32GB (AT&T)', verbose=True)

user: Raquel     item: Samsung Galaxy S7 edge 32GB (AT&T) r_ui = None   est = 8.55   {'was_impossible': False}


### For user 'Raquel' we can see that the estimated rating for product 'Nokia N95' is 8.84; and product 'Samsung Galaxy s7 edge' is 8.78

#### Evaluate the collaborative model. Print RMSE value

### RMSE 

In [40]:
print("SVD Model : Test Set")
accuracy.rmse(predictions, verbose=True)

SVD Model : Test Set
RMSE: 0.3302


0.33015324700677645

#### RMSE for above data is 0.3306

In [41]:
cross_validate(algo, data, measures=['RMSE'], cv=3, verbose=False)

{'test_rmse': array([2.51289981, 2.50499446, 2.54872606]),
 'fit_time': (0.3151559829711914, 0.27463603019714355, 0.2794361114501953),
 'test_time': (0.015092849731445312,
  0.019617080688476562,
  0.015522241592407227)}

### RMSE of SVD model is lower than for cross validation.



### 10. In what business scenario you should use popularity based Recommendation Systems ?

We can use popularity based recommendation systems in wide range of business scenarios such as:
1. Shopping.
2. Streaming.
3. Trading
4. e-commerce
5. News etc.

### 11. In what business scenario you should use CF based Recommendation Systems ?

1. Shopping
2. Streaming Music/ videos.
3. Sales
4. Trading.
5. Matrimony etc.

### 12. What other possible methods can you think of which can further improve the recommendation for different users ?

We can give better results by knowing more about the user. We can appply techniques like KNN to find out user's interests, previous buying or watching history and suggest something that is relevant for the user.

We can also ask follow up questions from users asking whether the user is satisfied with the recomendations or not, and make the necessary changes. 

We can measure screen time, mouse track, clicks, etc to further collect data of the user and serve him with the content or products that he will be interested in. 

we can also set up a time based recomendation system that suggests users content based on his timezone. Like music that might help you sleep can be suggested later in the evening, songs according to the weather of the location of user etc.