In [1]:
import pandas as pd
import numpy as np

# 1.0 Load data

In [2]:
df_trans = pd.read_csv('../data/transactions_train.csv', dtype={'article_id':'string'})
df_cust = pd.read_csv('../data/customers.csv')
df_cust.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1371980 entries, 0 to 1371979
Data columns (total 7 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   customer_id             1371980 non-null  object 
 1   FN                      476930 non-null   float64
 2   Active                  464404 non-null   float64
 3   club_member_status      1365918 non-null  object 
 4   fashion_news_frequency  1355971 non-null  object 
 5   age                     1356119 non-null  float64
 6   postal_code             1371980 non-null  object 
dtypes: float64(3), object(4)
memory usage: 73.3+ MB


# 2.0 Generate age groups and assign to transactions

In [3]:
# Generate age groups and assign to customers:
bins= [0, 20, 25 , 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 100]
labels = ['0-19','20-24', '25-29','30-34', '35-39','40-44', '45-49','50-54', '55-59','60-64', '65-69','70-74', '>75']
df_cust['agegroup'] = pd.cut(df_cust['age'], bins=bins, labels=labels, right=False)
df_cust['agegroup'] = df_cust['agegroup'].cat.add_categories('unknown').fillna('unknown')
df_cust.head()

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,agegroup
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,,,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...,45-49
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,,,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...,25-29
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,,,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...,20-24
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,,,ACTIVE,NONE,54.0,5d36574f52495e81f019b680c843c443bd343d5ca5b1c2...,50-54
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,ACTIVE,Regularly,52.0,25fa5ddee9aac01b35208d01736e57942317d756b32ddd...,50-54


In [4]:
# Assign age groups to transactions based on customer_id:
df_age = df_cust.drop(columns=['FN', 'Active', 'club_member_status',
       'fashion_news_frequency', 'age', 'postal_code'])
df_trans_age = pd.merge(df_trans, df_age, on="customer_id")
df_trans_age.drop(columns=['t_dat', 'customer_id', 'price', 'sales_channel_id'], inplace=True)
df_trans_age.head()

Unnamed: 0,article_id,agegroup
0,663713001,20-24
1,541518023,20-24
2,663713001,20-24
3,578020002,20-24
4,723529001,20-24


# 4.0 Calculate top 12 per age group

In [5]:
# Calculate article_id counts per age group via crosstab:
sales_per_agegroup = pd.crosstab(df_trans_age.article_id, df_trans_age.agegroup).reset_index()
sales_per_agegroup.head()

agegroup,article_id,0-19,20-24,25-29,30-34,35-39,40-44,45-49,50-54,55-59,60-64,65-69,70-74,>75,unknown
0,108775015,95,1877,3120,1837,693,516,832,1017,524,166,51,26,4,83
1,108775044,72,1151,1852,1204,423,408,603,759,437,186,72,37,11,35
2,108775051,2,32,61,41,13,13,13,15,14,7,4,0,0,0
3,110065001,14,128,205,138,77,90,170,120,65,23,8,2,1,3
4,110065002,7,75,75,62,31,58,86,82,41,12,5,0,1,4


In [6]:
# Loop for generating top 12 article list per age group:
groups = list(sales_per_agegroup.columns)
groups.remove('article_id')
df_top12 = pd.DataFrame(columns = ['agegroup', 'articles'])
for element in groups:
    sales_ordered = sales_per_agegroup.sort_values(element,ascending=False).iloc[:12]
    article_list = list(sales_ordered.article_id)
    # append rows to an empty DataFrame
    df_top12 = df_top12.append({'agegroup' : element, 'articles' : article_list}, ignore_index = True)

print(df_top12)

  df_top12 = df_top12.append({'agegroup' : element, 'articles' : article_list}, ignore_index = True)
  df_top12 = df_top12.append({'agegroup' : element, 'articles' : article_list}, ignore_index = True)
  df_top12 = df_top12.append({'agegroup' : element, 'articles' : article_list}, ignore_index = True)
  df_top12 = df_top12.append({'agegroup' : element, 'articles' : article_list}, ignore_index = True)
  df_top12 = df_top12.append({'agegroup' : element, 'articles' : article_list}, ignore_index = True)
  df_top12 = df_top12.append({'agegroup' : element, 'articles' : article_list}, ignore_index = True)
  df_top12 = df_top12.append({'agegroup' : element, 'articles' : article_list}, ignore_index = True)
  df_top12 = df_top12.append({'agegroup' : element, 'articles' : article_list}, ignore_index = True)
  df_top12 = df_top12.append({'agegroup' : element, 'articles' : article_list}, ignore_index = True)
  df_top12 = df_top12.append({'agegroup' : element, 'articles' : article_list}, ignore_inde

   agegroup                                           articles
0      0-19  [0706016001, 0759871002, 0706016002, 046429700...
1     20-24  [0706016001, 0706016002, 0759871002, 037286000...
2     25-29  [0706016001, 0372860001, 0610776002, 070601600...
3     30-34  [0706016001, 0610776002, 0464297007, 037286000...
4     35-39  [0610776002, 0562245001, 0706016001, 046429700...
5     40-44  [0706016001, 0610776002, 0372860001, 070601600...
6     45-49  [0706016001, 0706016002, 0372860001, 037286000...
7     50-54  [0706016001, 0706016002, 0372860001, 037286000...
8     55-59  [0706016001, 0678942001, 0579541001, 061077600...
9     60-64  [0579541001, 0678942001, 0399256005, 061077600...
10    65-69  [0579541001, 0399256005, 0399256001, 061077600...
11    70-74  [0399256005, 0579541001, 0399256001, 061077600...
12      >75  [0399256005, 0399256001, 0610776001, 069093600...
13  unknown  [0706016001, 0706016002, 0372860001, 061077600...


# 3.0 Generate submission csv

In [7]:
# Assign top 12 articles to each customer based on customers age group:
df_submssion = pd.merge(df_age, df_top12, on='agegroup')
df_submssion.head()

Unnamed: 0,customer_id,agegroup,articles
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,45-49,"[0706016001, 0706016002, 0372860001, 037286000..."
1,00009c2aeae8761f738e4f937d9be6b49861a66339c2b1...,45-49,"[0706016001, 0706016002, 0372860001, 037286000..."
2,0000b95f630aaa9313028ce9c41154bb95ac7afa34f55b...,45-49,"[0706016001, 0706016002, 0372860001, 037286000..."
3,0000c97821eb48d0e590fd309133f0a6c08f7750f64ccc...,45-49,"[0706016001, 0706016002, 0372860001, 037286000..."
4,0000f2ea26b7f0a9175f428c8cf7743e9e10e193465ecd...,45-49,"[0706016001, 0706016002, 0372860001, 037286000..."


In [11]:
# Transform prediction list to string and drop not necessary columns:
df_submssion['prediction'] = df_submssion.articles.apply(lambda x: ' '.join(x))
df_submssion.drop(columns=['agegroup', 'articles'], inplace=True)
df_submssion.head()

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0706016001 0706016002 0372860001 0372860002 05...
1,00009c2aeae8761f738e4f937d9be6b49861a66339c2b1...,0706016001 0706016002 0372860001 0372860002 05...
2,0000b95f630aaa9313028ce9c41154bb95ac7afa34f55b...,0706016001 0706016002 0372860001 0372860002 05...
3,0000c97821eb48d0e590fd309133f0a6c08f7750f64ccc...,0706016001 0706016002 0372860001 0372860002 05...
4,0000f2ea26b7f0a9175f428c8cf7743e9e10e193465ecd...,0706016001 0706016002 0372860001 0372860002 05...


In [13]:
# Store top12 per agegroup as submission csv:
df_submssion.to_csv('../data/top12-per-agegroup.csv', index = False)