In [2]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from skimage import io
from sklearn.decomposition import PCA

In [3]:
import os
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [4]:
transactions = pd.read_csv('/gdrive/MyDrive/HM_articles_recommendation/transactions_train.csv')
articles = pd.read_csv('/gdrive/MyDrive/HM_articles_recommendation/articles.csv')
customers = pd.read_csv('/gdrive/MyDrive/HM_articles_recommendation/customers.csv')

In [5]:
users = next(transactions)

In [6]:
df = users.merge(articles, on='article_id')

In [7]:
df.shape

(100000, 29)

In [8]:
df = df[['t_dat', 'customer_id', 'article_id', 'prod_name', 'product_type_name', 'product_group_name','graphical_appearance_name', 'colour_group_name',
       'perceived_colour_value_name',
       'perceived_colour_master_name',
       'department_name', 'index_name',
       'index_group_name', 'section_name',
       'garment_group_name', 'detail_desc']]

features = ['product_group_name', 
       'graphical_appearance_name', 'colour_group_name',
       'perceived_colour_value_name',
       'perceived_colour_master_name',
       'department_name', 'index_name',
       'index_group_name', 'section_name',
       'garment_group_name']

In [9]:
df1 = df[['customer_id', 'article_id'] + features]
dummies_df = pd.get_dummies(df1, columns=features)
dummies_df.head()

Unnamed: 0,customer_id,article_id,product_group_name_Accessories,product_group_name_Bags,product_group_name_Cosmetic,product_group_name_Garment Full body,product_group_name_Garment Lower body,product_group_name_Garment Upper body,product_group_name_Items,product_group_name_Nightwear,...,garment_group_name_Shorts,garment_group_name_Skirts,garment_group_name_Socks and Tights,garment_group_name_Special Offers,garment_group_name_Swimwear,garment_group_name_Trousers,garment_group_name_Trousers Denim,"garment_group_name_Under-, Nightwear",garment_group_name_Unknown,garment_group_name_Woven/Jersey/Knitted mix Baby
0,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,3681748607f3287d2c3a65e00bb5fb153de30e9becf158...,663713001,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,4ef5967ff17bf474bffebe5b16bd54878e1d4105f7b4ed...,663713001,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,6b7b10d2d47516c82a6f97332478dab748070f09693f09...,663713001,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,8ac137752bbe914aa4ae6ad007a9a0c5b67a1ab2b2d474...,663713001,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [10]:
minimum_items = 0
customer_groups = dummies_df.groupby('customer_id')

l = []
cutomer_ids = []
article_ids = []
for key in customer_groups.groups.keys():
    k = customer_groups.get_group(key)
    if k.article_id.nunique() >= minimum_items:
        l.append(k.drop('article_id', axis=1).sum(numeric_only=True).values)
        cutomer_ids.append(key)
        article_ids.extend(k.article_id.values.tolist())
    else:
      continue

In [11]:
user_feature = pd.DataFrame(l, columns = dummies_df.columns[2:])
normalized_user_feature = user_feature.div(user_feature.sum(axis=1), axis=0)
normalized_user_feature.insert(0, 'customer_id', cutomer_ids)
normalized_user_feature = normalized_user_feature.set_index('customer_id')
normalized_user_feature.head()

Unnamed: 0_level_0,product_group_name_Accessories,product_group_name_Bags,product_group_name_Cosmetic,product_group_name_Garment Full body,product_group_name_Garment Lower body,product_group_name_Garment Upper body,product_group_name_Items,product_group_name_Nightwear,product_group_name_Shoes,product_group_name_Socks & Tights,...,garment_group_name_Shorts,garment_group_name_Skirts,garment_group_name_Socks and Tights,garment_group_name_Special Offers,garment_group_name_Swimwear,garment_group_name_Trousers,garment_group_name_Trousers Denim,"garment_group_name_Under-, Nightwear",garment_group_name_Unknown,garment_group_name_Woven/Jersey/Knitted mix Baby
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0000423b00ade91418cceaf3b26c6af3dd342b51fd051eec9c12fb36984420fa,0.0,0.0,0.0,0.033333,0.033333,0.033333,0.0,0.0,0.0,0.0,...,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0
00007d2de826758b65a93dd24ce629ed66842531df6699338c5570910a014cc2,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.06,0.0,0.0,0.0,0.0,0.0,0.0
0003abe64294e66a6310c3436fa9e5b754cc5603deef4f26fc8ab8d043af9358,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.025,0.0,0.0
0004068f54dbe1c7054b23c615edc5f733a508ecc54930bf323209f20410898c,0.0,0.0,0.0,0.05,0.0,0.05,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
item_feature = dummies_df.drop_duplicates(subset='article_id')
item_feature = item_feature[item_feature.article_id.isin(article_ids)].drop('customer_id', axis=1)
item_feature = item_feature.set_index('article_id')
item_feature.head()

Unnamed: 0_level_0,product_group_name_Accessories,product_group_name_Bags,product_group_name_Cosmetic,product_group_name_Garment Full body,product_group_name_Garment Lower body,product_group_name_Garment Upper body,product_group_name_Items,product_group_name_Nightwear,product_group_name_Shoes,product_group_name_Socks & Tights,...,garment_group_name_Shorts,garment_group_name_Skirts,garment_group_name_Socks and Tights,garment_group_name_Special Offers,garment_group_name_Swimwear,garment_group_name_Trousers,garment_group_name_Trousers Denim,"garment_group_name_Under-, Nightwear",garment_group_name_Unknown,garment_group_name_Woven/Jersey/Knitted mix Baby
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
663713001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
541518023,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
505221004,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
685687003,0,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
685687004,0,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [13]:
scores = normalized_user_feature.dot(item_feature.T)
scores.head()

article_id,663713001,541518023,505221004,685687003,685687004,685687001,505221001,688873012,501323011,598859003,...,661929001,466381012,573830001,714429001,641228003,661351001,637028001,623873001,548837002,538977001
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0000423b00ade91418cceaf3b26c6af3dd342b51fd051eec9c12fb36984420fa,0.166667,0.1,0.233333,0.166667,0.2,0.2,0.233333,0.2,0.233333,0.166667,...,0.233333,0.133333,0.1,0.2,0.366667,0.233333,0.233333,0.033333,0.166667,0.1
000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318,0.8,0.8,0.05,0.15,0.15,0.15,0.0,0.15,0.35,0.25,...,0.0,0.35,0.1,0.1,0.15,0.05,0.0,0.15,0.05,0.05
00007d2de826758b65a93dd24ce629ed66842531df6699338c5570910a014cc2,0.1,0.1,0.52,0.6,0.58,0.58,0.48,0.22,0.16,0.16,...,0.18,0.04,0.14,0.1,0.08,0.1,0.18,0.04,0.08,0.26
0003abe64294e66a6310c3436fa9e5b754cc5603deef4f26fc8ab8d043af9358,0.275,0.125,0.325,0.15,0.3,0.25,0.325,0.15,0.275,0.225,...,0.325,0.225,0.175,0.225,0.3,0.15,0.325,0.075,0.15,0.175
0004068f54dbe1c7054b23c615edc5f733a508ecc54930bf323209f20410898c,0.15,0.2,0.05,0.2,0.2,0.2,0.05,0.3,0.2,0.2,...,0.1,0.35,0.2,0.15,0.25,0.15,0.2,0.15,0.1,0.1


In [14]:
pca = PCA(n_components=100)
pca.fit(normalized_user_feature)
pca.explained_variance_ratio_.sum()

0.9525069292873788

In [15]:
user_feature_pca = pd.DataFrame(pca.transform(normalized_user_feature), 
                                columns=['component_{}'.format(i) for i in range(1, 101)]).set_index(normalized_user_feature.index)
item_feature_pca = pd.DataFrame(pca.transform(item_feature), 
                                columns=['component_{}'.format(i) for i in range(1, 101)]).set_index(item_feature.index)

In [16]:
scores_pca = user_feature_pca.dot(item_feature_pca.T)

In [17]:
def get_recommend(customer_id, scores):
    cutomer_scores = scores.loc[customer_id]
    customer_prev_items = customer_groups.get_group(customer_id)['article_id']
    prev_dropped = cutomer_scores.drop(customer_prev_items.values)
    ordered = prev_dropped.sort_values(ascending=False)   
    return ordered, customer_prev_items

In [18]:
k = 12
customer_id = scores.index

#rcmnds, prev_items = get_recommend(customer_id, scores)
#rcmnds_pca, prev_items = get_recommend(customer_id, scores_pca)
#rcmnds = rcmnds.index.values[:k]
#rcmnds_pca = rcmnds_pca.index.values[:k]

In [19]:
res = pd.DataFrame(columns=['customer_id', 'prediction'])
res_pca = pd.DataFrame(columns=['customer_id', 'prediction'])
res_li = []
res_li_pca = []
for idx in customer_id:
  rcmnds, prev_items = get_recommend(idx, scores)
  rcmnds_pca, prev_items = get_recommend(idx, scores_pca)
  rcmnds = rcmnds.index.values[:k]
  rcmnds_pca = rcmnds_pca.index.values[:k]
  res_li.append(rcmnds)
  res_li_pca.append(rcmnds_pca)


This is the output

In [20]:
res['customer_id'] = customer_id
res['prediction'] = res_li
res_pca['customer_id'] = customer_id
res_pca['prediction'] = res_li_pca

In [28]:
from google.colab import files

res.to_csv('submission.csv', encoding = 'utf-8-sig')
files.download('submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>