In [18]:
import warnings
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import manhattan_distances
from sklearn.metrics.pairwise import euclidean_distances
# Suppress every warning category for the rest of the kernel session.
# Note: this also hides pandas deprecation / SettingWithCopy warnings.
warnings.simplefilter("ignore")

In [28]:
# Load the orders table (first CSV column becomes the row index)
# and keep only rows that have no missing values.
df = pd.read_csv('new.csv', index_col=0).dropna()

In [29]:
# Preview the first rows of the cleaned orders table.
df.head()

Unnamed: 0,Order_ID,Order_Date,Ship_Date,Ship_Mode,Customer_ID,Customer_Name,Segment,City,State,Country,...,Sales,Quantity,Discount,Profit,Shipping_Cost,Order_Priority,Processing_Time_days,Returned,Income_level,Gender
13,MX-2013-VF2171518-41591,2013-11-13,2013-11-13,Same Day,VF-2171518,Vicky Freymann,Home Office,Toledo,Parana,Brazil,...,2221.8,7,0.0,622.02,810.25,Critical,0,0,UM,1.0
14,IN-2014-PF1912027-41796,2014-06-06,2014-06-08,Second Class,PF-1912027,Peter Fuller,Consumer,Mudanjiang,Heilongjiang,China,...,3701.52,12,0.0,1036.08,804.54,Critical,2,0,UM,0.0
15,ES-2015-BP1118545-42216,2015-07-31,2015-08-03,Second Class,BP-1118545,Ben Peterman,Corporate,Paris,Ile-de-France,France,...,1869.588,4,0.1,186.948,801.66,Critical,3,0,H,0.0
17,ES-2015-PJ1883564-42255,2015-09-08,2015-09-14,Standard Class,PJ-1883564,Patrick Jones,Corporate,Prato,Tuscany,Italy,...,7958.58,14,0.0,3979.08,778.32,Low,6,0,H,0.0
19,TZ-2015-RH9555129-42343,2015-12-05,2015-12-07,Second Class,RH-9555129,Ritsa Hightower,Consumer,Uvinza,Kigoma,Tanzania,...,3409.74,6,0.0,818.28,763.38,High,2,0,L,1.0


In [None]:
# Reusable GroupBy object: order quantities bucketed by customer income level.
# Uses columns that actually exist in df so the cell runs on a fresh kernel.
grouped = df['Quantity'].groupby(df['Income_level'])

In [43]:
# Average ordered quantity within each income level.
df.groupby('Income_level')['Quantity'].mean()

Income_level
H     3.688479
L     2.418947
LM    3.356129
UM    3.399868
Name: Quantity, dtype: float64

In [37]:
# Column names, non-null counts and dtypes of the cleaned orders table.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23718 entries, 13 to 51287
Data columns (total 26 columns):
Order_ID                23718 non-null object
Order_Date              23718 non-null object
Ship_Date               23718 non-null object
Ship_Mode               23718 non-null object
Customer_ID             23718 non-null object
Customer_Name           23718 non-null object
Segment                 23718 non-null object
City                    23718 non-null object
State                   23718 non-null object
Country                 23718 non-null object
Region                  23718 non-null object
Market                  23718 non-null object
Product_ID              23718 non-null object
Category                23718 non-null object
Sub-Category            23718 non-null object
Product_Name            23718 non-null object
Sales                   23718 non-null float64
Quantity                23718 non-null int64
Discount                23718 non-null float64
Profit         

In [30]:
# Product_ID
# Build a customer x product purchase matrix in CSR form.
# Work on an explicit copy so adding 'Count' never writes into a slice of df.
purchased = df[['Customer_ID', 'Product_ID']].copy()
purchased['Count'] = 1

# Row / column labels in order of first appearance in the orders table.
Customer_ID = list(purchased.Customer_ID.unique())
Product_ID = list(purchased.Product_ID.unique())

data = purchased['Count'].tolist()
# Translate each label into its position within the label lists above.
# CategoricalDtype replaces the removed `astype('category', categories=...)` form.
row = purchased.Customer_ID.astype(pd.api.types.CategoricalDtype(categories=Customer_ID)).cat.codes
col = purchased.Product_ID.astype(pd.api.types.CategoricalDtype(categories=Product_ID)).cat.codes
# Duplicate (row, col) pairs are summed by the constructor, so a cell holds
# the number of order lines for that customer/product pair.
sparse_matrix = csr_matrix((data, (row, col)), shape=(len(Customer_ID), len(Product_ID)))

In [31]:
# Label the sparse matrix: one row per customer, one column per product.
# Built directly from the SciPy matrix (pd.SparseDataFrame / pd.SparseSeries were
# removed from pandas), which also avoids densifying every row one at a time.
purchased_per_customer = pd.DataFrame.sparse.from_spmatrix(
    sparse_matrix, index=Customer_ID, columns=Product_ID
)

In [24]:
# Preview the customer x product purchase table.
purchased_per_customer.head()

Unnamed: 0,FUR-CH-4530,OFF-AP-4959,OFF-AP-3575,OFF-AP-4743,OFF-AP-4967,FUR-TA-4644,FUR-CH-5774,OFF-AP-4960,TEC-PH-5268,FUR-TA-4643,...,OFF-ST-3926,TEC-PH-6091,OFF-PA-4882,TEC-PH-3181,TEC-AC-5212,OFF-BI-4831,TEC-MA-4598,FUR-FU-3849,TEC-PH-4895,TEC-PH-3808
VF-2171518,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PF-1912027,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BP-1118545,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PJ-1883564,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
RH-9555129,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [60]:
# Demographic data
# Take an explicit copy so the recoding below never writes into a slice of df.
demog = df[['Customer_ID', 'Gender', 'Segment', 'Market', 'Income_level']].copy()

# Category label -> digit-string code. Values are matched exactly; labels that
# are not listed are left unchanged.
SEGMENT_CODES = {'Consumer': '1', 'Corporate': '2', 'Home Office': '3'}
MARKET_CODES = {'USCA': '1', 'Asia Pacific': '2', 'Europe': '3', 'Africa': '4', 'LATAM': '5'}

demog['Segment'] = demog['Segment'].replace(SEGMENT_CODES)
demog['Market'] = demog['Market'].replace(MARKET_CODES)


In [61]:
# Recode income levels as digit strings: H -> 1, UM -> 3, LM -> 4.
# Plain 'L' is handled in the following cell: 'L' is a substring of 'LM',
# so it must only be replaced once 'LM' has already been rewritten.
demog['Income_level'] = demog.Income_level.map(
    lambda level: level.replace('H', '1').replace('UM', '3').replace('LM', '4')
).tolist()

In [62]:
# Recode the remaining 'L' income level as '2'.
# Must run after 'LM' has been recoded, since 'L' is a substring of 'LM'.
demog['Income_level'] = demog.Income_level.map(lambda level: level.replace('L', '2')).tolist()

In [63]:
# Collapse to one demographic row per customer, indexed by Customer_ID,
# keeping the first non-null value of each column within the customer's rows.
demog_by_customer = demog.groupby('Customer_ID').first()

In [64]:
# Feature table: demographic codes side by side with purchase counts,
# restricted to customers present in both inputs.
new_df = pd.concat(
    objs=[demog_by_customer, purchased_per_customer],
    join='inner',
    axis='columns',
)

In [78]:
# Preview the combined demographic + purchase feature table.
new_df.head(5)

Unnamed: 0,Gender,Segment,Market,Income_level,FUR-CH-4530,OFF-AP-4959,OFF-AP-3575,OFF-AP-4743,OFF-AP-4967,FUR-TA-4644,...,OFF-ST-3926,TEC-PH-6091,OFF-PA-4882,TEC-PH-3181,TEC-AC-5212,OFF-BI-4831,TEC-MA-4598,FUR-FU-3849,TEC-PH-4895,TEC-PH-3808
AA-10480101,0.0,1,5,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AA-10480102,0.0,1,2,4,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AA-10480130,0.0,1,2,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AA-10480139,0.0,1,3,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AA-104801402,0.0,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [68]:
# Pairwise cosine similarity between the customer rows of new_df,
# labelled with the customer ids on both axes.
cosine = pd.DataFrame(
    cosine_similarity(new_df),
    index=new_df.index,
    columns=new_df.index,
)

In [81]:
# Preview the customer-by-customer similarity matrix.
cosine.head()

Unnamed: 0,AA-10480101,AA-10480102,AA-10480130,AA-10480139,AA-104801402,AA-104801406,AA-104801408,AA-1048031,AA-1048045,AA-1048048,...,ZD-219251404,ZD-219251408,ZD-2192518,ZD-2192527,ZD-2192545,ZD-2192548,ZD-2192564,ZD-219257,ZD-2192582,ZD-2192596
AA-10480101,1.0,0.817269,0.860663,0.81763,0.416025,0.75,0.75,0.934081,0.846327,0.914138,...,0.566947,0.612372,0.972222,0.860663,0.81763,0.81763,0.81763,0.881917,0.958994,0.878275
AA-10480102,0.817269,1.0,0.935819,0.60553,0.413919,0.746203,0.746203,0.785207,0.626783,0.677003,...,0.564076,0.609272,0.817269,0.935819,0.60553,0.60553,0.60553,0.725241,0.80615,0.650444
AA-10480130,0.860663,0.935819,1.0,0.666667,0.429669,0.774597,0.774597,0.826898,0.690066,0.745356,...,0.58554,0.632456,0.860663,0.933333,0.666667,0.666667,0.666667,0.78072,0.848953,0.716115
AA-10480139,0.81763,0.60553,0.666667,1.0,0.358057,0.645497,0.645497,0.785553,0.759072,0.819892,...,0.48795,0.527046,0.81763,0.666667,0.733333,0.733333,0.733333,0.78072,0.806505,0.787726
AA-104801402,0.416025,0.413919,0.429669,0.358057,1.0,0.416025,0.416025,0.399704,0.370625,0.40032,...,0.314485,0.339683,0.416025,0.429669,0.358057,0.358057,0.358057,0.419314,0.410365,0.384615


In [69]:
# Similarity scores of the first customer against everyone else, best first.
# Scores of 1 or above are dropped, which removes the customer's own entry.
similarity = (
    cosine.iloc[0]
    .loc[lambda scores: scores < 1]
    .sort_values(ascending=False)
)

In [70]:
# The ten highest similarity scores for the first customer.
similarity.head(10)

BS-1136539     0.972433
RD-1958582     0.972222
CM-1223518     0.972222
GM-1444036     0.972222
JH-1598518     0.972222
RB-1957082     0.972222
GM-1444082     0.972222
RC-19825101    0.972222
RC-1982528     0.972222
RC-1982536     0.972222
Name: AA-10480101, dtype: float64

In [71]:
# Find top 10 users: customer ids of the ten highest-scoring neighbours.
top_10 = similarity.head(10).index

In [72]:
# Check top 1 user first: the single closest neighbour and the products they bought.
top_1 = similarity.index[0]
purchased_history = df.loc[df.Customer_ID == top_1, 'Product_ID'].values
print(top_1, purchased_history)

BS-1136539 ['OFF-FA-6185' 'OFF-BI-6381']


In [73]:
# Check what have they purchased
# Collect every product bought by the ten most similar customers,
# in neighbour order and without duplicates.
top_10_purchase_history = []
for user in top_10:
    purchased_history = df[df.Customer_ID == user]['Product_ID'].values
    for item in purchased_history:
        if item not in top_10_purchase_history:
            top_10_purchase_history.append(item)

# Recommend the neighbours' products that the target customer has NOT bought yet
# (same direction as the comparison inside recommender()).
purchased_history_target = df[df.Customer_ID == similarity.name]['Product_ID'].values
recommend_items = [
    item for item in top_10_purchase_history
    if item not in purchased_history_target
]
        
        

In [74]:
def recommender(user, top_n, similarity_matrix=None, orders=None, verbose=True):
    """Recommend products bought by the customers most similar to ``user``.

    Parameters
    ----------
    user : label
        Customer id; must be a row/column label of the similarity matrix.
    top_n : int
        Number of most-similar customers whose purchases are considered.
    similarity_matrix : pandas.DataFrame, optional
        Square customer-by-customer similarity table with identical row and
        column labels. Defaults to the notebook-level ``cosine`` frame.
    orders : pandas.DataFrame, optional
        Order lines with ``Customer_ID`` and ``Product_ID`` columns.
        Defaults to the notebook-level ``df`` frame.
    verbose : bool, default True
        When true, print the neighbours' combined purchase history and the
        target customer's own purchase history.

    Returns
    -------
    list
        Products bought by the neighbours that ``user`` has not bought,
        in neighbour order, without duplicates.

    Raises
    ------
    ValueError
        If ``user`` is not a label of the similarity matrix.
    """
    sim = cosine if similarity_matrix is None else similarity_matrix
    frame = df if orders is None else orders

    if user not in sim.index:
        raise ValueError("unknown customer id: {!r}".format(user))

    # Remove the customer's own entry by label instead of filtering on `< 1`:
    # a self-similarity that rounds to just under 1 would otherwise keep the
    # customer as their own best neighbour, and neighbours with an identical
    # profile (score exactly 1) would be thrown away.
    scores = sim.loc[user].drop(labels=user)
    neighbours = scores.sort_values(ascending=False).index[:top_n]

    # Ordered, de-duplicated union of everything the neighbours bought.
    neighbour_items = []
    seen = set()
    for neighbour in neighbours:
        for item in frame.loc[frame.Customer_ID == neighbour, 'Product_ID'].values:
            if item not in seen:
                seen.add(item)
                neighbour_items.append(item)
    if verbose:
        print(neighbour_items)

    purchased_history_target = frame.loc[frame.Customer_ID == user, 'Product_ID'].values
    if verbose:
        print(purchased_history_target)

    already_bought = set(purchased_history_target)
    return [item for item in neighbour_items if item not in already_bought]

In [76]:
# Recommendations for one customer, based on their ten nearest neighbours.
recommender(user='BS-1136539', top_n=10)

['OFF-BI-6381', 'OFF-FA-6185', 'TEC-CO-3607', 'FUR-CH-5457', 'OFF-PA-4146', 'FUR-FU-4104', 'OFF-ST-5686', 'OFF-EN-3670', 'OFF-ST-6060', 'OFF-BI-3722']
['OFF-FA-6185' 'OFF-BI-6381']


['TEC-CO-3607',
 'FUR-CH-5457',
 'OFF-PA-4146',
 'FUR-FU-4104',
 'OFF-ST-5686',
 'OFF-EN-3670',
 'OFF-ST-6060',
 'OFF-BI-3722']