In [1266]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold

import warnings
warnings.filterwarnings(action='ignore')

In [1764]:
def cross_val(model, X, y):
    rskfold = RepeatedStratifiedKFold(n_splits=4, n_repeats=3, random_state=42)
    scores = cross_val_score(model, X, y, cv=rskfold, scoring='accuracy')
    print('CV score: {:.2%} (+/- {:.2})'.format(scores.mean(), scores.std()))

In [225]:
path = 'data/'

In [None]:
!mkdir subms 

In [226]:
!ls data/

test_Yix80N0.csv  train_8wry4cB.csv


In [1796]:
train = pd.read_csv(f'{path}train_8wry4cB.csv')
test = pd.read_csv(f'{path}test_Yix80N0.csv')
train.head()

Unnamed: 0,session_id,startTime,endTime,ProductList,gender
0,u16159,15/12/14 18:11,15/12/14 18:12,A00002/B00003/C00006/D28435/;A00002/B00003/C00...,female
1,u10253,16/12/14 14:35,16/12/14 14:41,A00001/B00009/C00031/D29404/;A00001/B00009/C00...,male
2,u19037,01/12/14 15:58,01/12/14 15:58,A00002/B00001/C00020/D16944/,female
3,u14556,23/11/14 2:57,23/11/14 3:00,A00002/B00004/C00018/D10284/;A00002/B00004/C00...,female
4,u24295,17/12/14 16:44,17/12/14 16:46,A00001/B00001/C00012/D30805/;A00001/B00001/C00...,male


In [310]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10500 entries, 0 to 10499
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   session_id   10500 non-null  object
 1   startTime    10500 non-null  object
 2   endTime      10500 non-null  object
 3   ProductList  10500 non-null  object
 4   gender       10500 non-null  object
dtypes: object(5)
memory usage: 410.3+ KB


In [311]:
train.gender.value_counts()

female    8192
male      2308
Name: gender, dtype: int64

In [312]:
train['startTime'] = pd.to_datetime(train['startTime'])
train['endTime']= pd.to_datetime(train['endTime'])

train['gender'] = train.gender.astype('category')
mapper = dict(enumerate(m1_train.gender.cat.categories))

In [313]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10500 entries, 0 to 10499
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   session_id   10500 non-null  object        
 1   startTime    10500 non-null  datetime64[ns]
 2   endTime      10500 non-null  datetime64[ns]
 3   ProductList  10500 non-null  object        
 4   gender       10500 non-null  category      
dtypes: category(1), datetime64[ns](2), object(2)
memory usage: 338.6+ KB


In [1736]:
m2_train = train.copy()
count_vect1 = CountVectorizer(lowercase=False, binary=True)
count_vect1.fit(m2_train['ProductList'])

CountVectorizer(analyzer='word', binary=True, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=False, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [1737]:
doc_array = count_vect1.transform(m2_train['ProductList']).toarray()
frequency_matrix = pd.DataFrame(doc_array, columns = count_vect1.get_feature_names())

In [1738]:
m2_train['unix_starttime'] = m2_train['startTime'].astype(np.int64) // 10**9
m2_train['unix_endTime'] = m2_train['endTime'].astype(np.int64) // 10**9
m2_train['duration_hrs'] = (m2_train['unix_endTime'] - m2_train['unix_starttime']) / 60

In [1739]:
m2_train = pd.concat([m2_train, frequency_matrix], axis=1)

In [1740]:
m2_train['Prod_count'] = m2_train['ProductList'].apply(lambda x: len(x.split(';')))

In [1756]:
prods = m2_train[m2_train.columns[m2_train.columns.str.contains('D')]]
cols_to_drop = list(m2_train.columns[:5]) + list(prods.columns)

## Train-Test-Split

In [1757]:
features = m2_train.drop(cols_to_drop, axis=1)
target = m2_train.gender.cat.codes

In [1758]:
X_train, X_valid, y_train, y_valid = train_test_split(features, target, stratify=target, shuffle=True ,random_state=42)

## Models

In [1769]:
lr_binary = LogisticRegression(solver='liblinear', penalty='l1', n_jobs=-1)
lr_binary.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=-1, penalty='l1',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [1760]:
print('Train score is {:.4%} \nTest score is {:.4%}'.format(lr_binary.score(X_train, y_train), lr_binary.score(X_valid, y_valid)))

Train score is 87.7587% 
Test score is 87.7714%


In [1767]:
scores = cross_val(lr_binary, features, target)

CV score: 87.31% (+/- 0.0062)


## Test Data

In [1743]:
m1_test = test.copy()

In [1744]:
doc_array = count_vect.transform(m1_test['ProductList']).toarray()
frequency_matrix = pd.DataFrame(doc_array, columns = count_vect.get_feature_names())

In [1745]:
m1_test = pd.concat([m1_test, frequency_matrix], axis=1)
m1_test['Prod_count'] = m1_test['ProductList'].apply(lambda x: len(x.split(';')))

In [1746]:
m1_test['startTime'] = pd.to_datetime(m1_test['startTime'])
m1_test['endTime']= pd.to_datetime(m1_test['endTime'])

In [1747]:
m1_test['unix_starttime'] = m1_test['startTime'].astype(np.int64) // 10**9
m1_test['unix_endTime'] = m1_test['endTime'].astype(np.int64) // 10**9
m1_test['duration_hrs'] = (m1_test['unix_endTime'] - m1_test['unix_starttime']) / 60

In [1748]:
m1_test = m1_test.drop(prods, axis=1)

In [1749]:
test_cols = m1_test[m1_test.columns[4:]].columns

In [1768]:
preds = lr_binary.predict(m1_test[test_cols])
submissions = test[['session_id']]
submissions['gender'] = preds
submissions['gender'] = submissions['gender'].map(mapper)

In [1762]:
submissions.to_csv('subms/lr-liblinear.csv', index=False)

## Should have tried

In [1731]:
# def max_count_product(List):
#     return max(set(List), key=List.count)

# def product_group(x):
#     product_cat = [i.split('/')[-2][:4] for i in x.split(';')]
#     return max_count_product(product_cat)

In [1733]:
# m2_train['ProductList'].apply(lambda x: product_group(x))

In [1795]:
%run alternate-approach.py

CV score: 90.95% (+/- 0.0042)
Accuracy score is 91.815%
