# Part 3: Recommender System
---

### Contents:
- [Import libraries](#Import-libraries)
- [Recommender System](#Recommender-System)

## Import libraries

In [1]:
# import libraries

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

In [2]:
df = pd.read_csv('data/df_model.csv')
df.head()

Unnamed: 0,price,year,manufacturer,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,state
0,15000.0,2013.0,ford,excellent,6 cylinders,gas,128000.0,clean,automatic,rwd,full-size,truck,black,al
1,4500.0,1992.0,jeep,excellent,6 cylinders,gas,192000.0,clean,automatic,4wd,full-size,sedan,white,al
2,14000.0,2012.0,honda,excellent,6 cylinders,gas,95000.0,clean,automatic,fwd,full-size,mini-van,silver,al
3,15000.0,2017.0,dodge,excellent,8 cylinders,gas,90000.0,rebuilt,automatic,rwd,mid-size,sedan,grey,al
4,3000.0,2004.0,chrysler,good,6 cylinders,gas,176144.0,clean,automatic,fwd,mid-size,mini-van,silver,al


In [3]:
df.isnull().sum()

price           0
year            0
manufacturer    0
condition       0
cylinders       0
fuel            0
odometer        0
title_status    0
transmission    0
drive           0
size            0
type            0
paint_color     0
state           0
dtype: int64

## Recommender System

### Search by brand base on given features

In [4]:
def brand_recommender(manufacturer,price_range,n=6):
    data = df.loc[(df['manufacturer']==manufacturer)
                 & ((df['price']>=price_range[0]) & (df['price']<=price_range[1]))]
    data.reset_index(level =0, inplace = True)
    
    # Top n car recommendations
    car_recommend = data[['price','manufacturer','type','year','condition','fuel','title_status',
                'transmission','paint_color']].sample(n, ignore_index=True, random_state=42)
    return car_recommend.sort_values(by ='price', ignore_index=True)

In [5]:
brand_recommender('toyota',(10000, 20000),7)

Unnamed: 0,price,manufacturer,type,year,condition,fuel,title_status,transmission,paint_color
0,10995.0,toyota,SUV,2008.0,excellent,gas,clean,automatic,white
1,12987.0,toyota,sedan,2012.0,excellent,gas,clean,automatic,white
2,14998.0,toyota,SUV,2012.0,excellent,gas,clean,automatic,white
3,14999.0,toyota,SUV,2013.0,excellent,gas,clean,automatic,white
4,16999.0,toyota,truck,2010.0,good,gas,clean,manual,silver
5,17995.0,toyota,sedan,2014.0,like new,gas,clean,automatic,red
6,19500.0,toyota,SUV,2013.0,excellent,gas,clean,automatic,white


### Search by price 

In [6]:
def price_recommender(price_range,n):
    data = df.loc[(df['price']>=price_range[0]) & (df['price']<=price_range[1])]
    data.reset_index(level =0, inplace = True)
   
    # Top n car recommendations
    car_recommend = data[['price','manufacturer','type','year','condition','fuel','title_status',
                'transmission','paint_color']].sample(n, ignore_index=True, random_state=42)
    return car_recommend.sort_values(by ='price', ignore_index=True)

In [7]:
price_recommender((10000, 20000),7)

Unnamed: 0,price,manufacturer,type,year,condition,fuel,title_status,transmission,paint_color
0,12998.0,nissan,sedan,2018.0,excellent,gas,clean,automatic,white
1,16500.0,buick,SUV,2014.0,excellent,gas,clean,automatic,white
2,16900.0,jeep,SUV,2018.0,good,gas,clean,other,white
3,16990.0,honda,sedan,2015.0,good,other,clean,other,silver
4,17337.0,toyota,hatchback,2018.0,good,gas,clean,automatic,white
5,18500.0,audi,SUV,2015.0,excellent,diesel,clean,automatic,black
6,18995.0,land rover,SUV,2012.0,good,gas,clean,automatic,black


In [8]:
price_recommender((5000,10000),7)

Unnamed: 0,price,manufacturer,type,year,condition,fuel,title_status,transmission,paint_color
0,5000.0,subaru,wagon,2005.0,excellent,gas,clean,manual,blue
1,5995.0,nissan,SUV,2006.0,excellent,gas,clean,automatic,red
2,6999.0,subaru,wagon,2008.0,excellent,gas,clean,automatic,white
3,7500.0,honda,SUV,2009.0,good,gas,clean,automatic,brown
4,7500.0,kia,wagon,2013.0,excellent,gas,clean,automatic,black
5,8500.0,ford,sedan,2011.0,excellent,gas,clean,automatic,white
6,8995.0,toyota,mini-van,2005.0,excellent,gas,clean,automatic,silver


### Search with all features

In [9]:
def all_recommender(manufacturer,year,car_type,price_range,n):
    data = df.loc[(df['year']==year) & (df['type']==car_type) &(df['manufacturer']==manufacturer)
                 & ((df['price']>=price_range[0]) & (df['price']<=price_range[1]))]
    data.reset_index(level =0, inplace = True)
   
    # Top n car recommendations
    car_recommend = data[['price','manufacturer','type','year','condition','fuel','title_status'
                ,'transmission','paint_color']].sample(n, ignore_index=True, random_state=42)
    return car_recommend.sort_values(by ='price', ignore_index=True)

In [10]:
all_recommender('acura',2007,'sedan',(5000, 10000),7)

Unnamed: 0,price,manufacturer,type,year,condition,fuel,title_status,transmission,paint_color
0,5500.0,acura,sedan,2007.0,excellent,gas,rebuilt,automatic,white
1,5950.0,acura,sedan,2007.0,excellent,gas,clean,automatic,grey
2,6900.0,acura,sedan,2007.0,excellent,gas,clean,automatic,blue
3,7500.0,acura,sedan,2007.0,excellent,gas,clean,automatic,black
4,7995.0,acura,sedan,2007.0,excellent,gas,clean,automatic,blue
5,7995.0,acura,sedan,2007.0,good,gas,clean,automatic,silver
6,8550.0,acura,sedan,2007.0,excellent,gas,clean,automatic,white


### Recommender system with cosine similarity (Thank you Eric)

In [11]:
# in the cylinders column remove the word 'cylinder' to make it numerical
df['cylinders'].replace('other','0 cylinders', inplace=True)
df['cylinders']=df['cylinders'].apply(lambda x:x.split(' ')[0])
df['cylinders']=df['cylinders'].astype('int')
df['cylinders'].dtype

dtype('int64')

In [12]:
manufacturer_data = df[['manufacturer','price', 'year','cylinders','type']]

In [13]:
# cosine similarity could not run for the large amount of data
# due to the limit of computer memory, limit the data for cosine similarity 
# limit the data by limit the price

manufacturer_data = manufacturer_data.loc[manufacturer_data['price']<5000]
#manufacturer_data.reset_index(level =0, inplace = True)

In [14]:
manufacturer_data.shape

(38962, 5)

In [15]:
ohe = OneHotEncoder(drop='first')

In [16]:
ohe.fit(manufacturer_data[['manufacturer','type']])

In [17]:
ohe_data = ohe.transform(manufacturer_data[['manufacturer','type']])

In [18]:
ohe_df = pd.DataFrame(ohe_data.todense(), columns=ohe.get_feature_names_out())
ohe_df.head()

Unnamed: 0,manufacturer_alfa-romeo,manufacturer_aston-martin,manufacturer_audi,manufacturer_bmw,manufacturer_buick,manufacturer_cadillac,manufacturer_chevrolet,manufacturer_chrysler,manufacturer_datsun,manufacturer_dodge,...,type_coupe,type_hatchback,type_mini-van,type_offroad,type_other,type_pickup,type_sedan,type_truck,type_van,type_wagon
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [19]:
df_trans = pd.merge(manufacturer_data[['price','year','cylinders']], 
                    ohe_df, left_index=True, right_index=True)
df_trans.head(3)

Unnamed: 0,price,year,cylinders,manufacturer_alfa-romeo,manufacturer_aston-martin,manufacturer_audi,manufacturer_bmw,manufacturer_buick,manufacturer_cadillac,manufacturer_chevrolet,...,type_coupe,type_hatchback,type_mini-van,type_offroad,type_other,type_pickup,type_sedan,type_truck,type_van,type_wagon
1,4500.0,1992.0,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3000.0,2004.0,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5,2100.0,2006.0,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
df_trans.shape

(10650, 55)

In [21]:
sc = StandardScaler()
df_trans_ss = sc.fit_transform(df_trans)

In [22]:
sparse_pivot = sparse.csr_matrix(df_trans_ss)

In [23]:
recommeder = cosine_similarity(sparse_pivot)

In [24]:
recommender_df = pd.DataFrame(recommeder, columns=df_trans.index, index=df_trans.index)

recommender_df.head()

Unnamed: 0,1,4,5,9,22,23,24,25,27,28,...,38941,38942,38943,38946,38947,38948,38950,38956,38957,38960
1,1.0,-0.095287,-0.070704,-0.099848,-0.032939,-0.00328,-0.006924,-0.096177,-0.066774,-0.070576,...,-0.162239,-0.129902,-0.079647,-0.049797,-0.125072,-0.072717,-0.090635,-0.162043,0.438142,-0.058581
4,-0.095287,1.0,-0.143658,-0.002646,-0.003636,-0.029885,-0.082739,-0.048867,-0.022904,0.004397,...,-0.0597,-0.095905,0.060864,-0.03206,0.0489,0.896309,-0.100425,-0.06349,-0.02908,-0.151971
5,-0.070704,-0.143658,1.0,0.211019,-0.012368,-0.019195,-0.013388,-0.109565,-0.055626,0.226697,...,-0.123429,0.237637,0.108421,-0.072882,-0.113206,-0.136372,-0.20683,-0.131093,-0.093421,0.69356
9,-0.099848,-0.002646,0.211019,1.0,-0.043734,-0.024643,-0.072047,-0.02012,-0.03828,0.941608,...,-0.0999,0.879683,0.890033,0.016709,-0.080932,-0.08473,0.014593,-0.084196,-0.007998,-0.152511
22,-0.032939,-0.003636,-0.012368,-0.043734,1.0,-0.014476,-0.023265,-0.011483,0.006044,-0.013137,...,0.01058,-0.03395,-0.077554,-0.018383,0.030471,0.010234,-0.045138,0.003045,-0.051245,-0.045403


In [25]:
recommender_df.index

Index([    1,     4,     5,     9,    22,    23,    24,    25,    27,    28,
       ...
       38941, 38942, 38943, 38946, 38947, 38948, 38950, 38956, 38957, 38960],
      dtype='int64', length=10650)

In [26]:
df.iloc[29580,:]

price              2995.0
year               2004.0
manufacturer        buick
condition            good
cylinders               6
fuel                  gas
odometer         197514.0
title_status        clean
transmission    automatic
drive                 fwd
size            full-size
type                  SUV
paint_color        silver
state                  fl
Name: 29580, dtype: object

In [42]:
score = recommender_df[4].sort_values(ascending = False)[1:11]
score

29580    0.999999
2225     0.999999
2393     0.999691
38181    0.999691
29864    0.999420
36320    0.999419
34415    0.999418
22328    0.999416
1976     0.999117
6991     0.998771
Name: 4, dtype: float64

In [36]:
score.index

Index([29580, 2225, 2393, 38181, 29864, 36320, 34415, 22328, 1976, 6991], dtype='int64')

In [38]:
recommender_list =[]

In [69]:
for i in score.index:
    car_recommender = df[['price','manufacturer','type','year','condition','fuel',
                         'title_status','transmission','paint_color']].iloc[i,:]
    recommender_list.append(car_recommender)

In [43]:
pd.DataFrame(recommender_list)

Unnamed: 0,price,manufacturer,type,year,condition,fuel,title_status,transmission,paint_color
29580,2995.0,buick,SUV,2004.0,good,gas,clean,automatic,silver
2225,2995.0,kia,SUV,2004.0,excellent,gas,clean,manual,blue
2393,2900.0,chevrolet,SUV,2004.0,good,gas,clean,automatic,purple
38181,2900.0,nissan,SUV,2004.0,excellent,gas,clean,automatic,silver
29864,3000.0,chrysler,convertible,2005.0,good,gas,clean,automatic,silver
36320,2995.0,chevrolet,SUV,2005.0,excellent,gas,clean,automatic,white
34415,2999.0,honda,sedan,2003.0,good,gas,clean,automatic,white
22328,2988.0,ford,sedan,2005.0,excellent,gas,clean,automatic,white
1976,2900.0,hyundai,sedan,2005.0,good,gas,clean,automatic,blue
6991,2800.0,chrysler,convertible,2004.0,excellent,gas,lien,automatic,white


#### Thanks Jayson (TA)

1. Get the user input
2. Append the user input to the end of your original data frame
3. Run the cosine similarity on your data
4. Do the sort values on the last row of your data frame since it should be the new user input

In [66]:
input_list = [{'price':2900,'manufacturer':'nissan','type':'SUV','year':2004,'cylinders':6}]
input_data = pd.DataFrame(input_list)
data = df[['price','manufacturer','type','year','cylinders']]
data =data.loc[data['price']<5000]
data = pd.concat([input_data, data], ignore_index=True)
data.reset_index(level =0, inplace = True)
data.head()

Unnamed: 0,index,price,manufacturer,type,year,cylinders
0,0,2900.0,nissan,SUV,2004.0,6
1,1,4500.0,jeep,sedan,1992.0,6
2,2,3000.0,chrysler,mini-van,2004.0,6
3,3,2100.0,subaru,hatchback,2006.0,4
4,4,4000.0,jeep,SUV,2006.0,6


In [65]:
input_data = pd.DataFrame(input_list)
input_data

Unnamed: 0,price,manufacturer,type,year,cylinders
0,2900,nissan,SUV,2004,6


In [67]:
def cosine_recommeder(price,manufacturer,types,year, cylinders):
    input_list = [{'price':price,'manufacturer':manufacturer,'type':types,'year':year,'cylinders':cylinders}]
    input_data = pd.DataFrame(input_list)
    data = df[['price','manufacturer','type','year','cylinders']]
    data =data.loc[data['price']<5000]
    data = pd.concat([input_data, data], ignore_index=True)
    data.reset_index(level =0, inplace = True)
                     
    ohe.fit(data[['manufacturer','type']])
    ohe_data = ohe.transform(data[['manufacturer','type']])
    ohe_df = pd.DataFrame(ohe_data.todense(), columns=ohe.get_feature_names_out())
    df_trans = pd.merge(data[['price','year','cylinders']], 
                        ohe_df, left_index=True, right_index=True)
    sc = StandardScaler()
    df_trans_ss = sc.fit_transform(df_trans)
                     
    sparse_pivot = sparse.csr_matrix(df_trans_ss)
    recommeder = cosine_similarity(sparse_pivot)
    recommender_df = pd.DataFrame(recommeder, columns=df_trans.index, index=df_trans.index)
    score = recommender_df[0].sort_values(ascending = False)[1:11]
    
    recommender_list =[]
    for i in score.index:
        car_recommender = data[['price','manufacturer','type','year','cylinders']].iloc[i,:]
        recommender_list.append(car_recommender)
    
    return pd.DataFrame(recommender_list)

In [68]:
cosine_recommeder(2900,'nissan','SUV', 2004, 4)

Unnamed: 0,price,manufacturer,type,year,cylinders
10860,3000.0,nissan,SUV,2004.0,4
20084,3000.0,nissan,SUV,2004.0,4
10203,2995.0,nissan,SUV,2003.0,4
5895,3000.0,nissan,SUV,2003.0,4
32363,2800.0,nissan,SUV,2002.0,4
25463,2850.0,nissan,SUV,2007.0,4
24269,3290.0,nissan,SUV,2005.0,4
35024,3500.0,nissan,SUV,2003.0,4
19548,2195.0,nissan,SUV,2002.0,4
12262,2700.0,nissan,SUV,2010.0,4


## Conclusion

* Due to the limit of computer memory, the cosine similarity was not be able to run for this dataset. Limit the price of the data to limit the amount of data for cosine similary. 
* The cosine similarity works when limit the data to a small dataset.
* Brand recommender, Price recommender, and All recemmender can be use as a recommender system. Those recommerder systems do not provide a perfect result but those recommender systems will give a list used car base on some features that customers are looking for.