<a href="https://colab.research.google.com/github/tsnarendran14/My-Scripts/blob/master/Poshmark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Loading Libraries and Mounting Google Drive

In [2]:
# Load Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm
np.random.seed(123)
from itertools import product
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
from scipy import stats
from sklearn.model_selection import RandomizedSearchCV, KFold
from tqdm import tqdm

In [4]:
# Load files from google drive
from google.colab import drive
drive.mount('/content/drive/')
%cd /content/drive/My Drive/Colab Notebooks/

Mounted at /content/drive/
/content/drive/My Drive/Colab Notebooks


# 2. Reading Data

In [5]:
data = pd.read_csv('ds-take-home-dataset.csv')

In [6]:
data.head()

Unnamed: 0,id,attr1,attr2,attr3,attr4,attr5,attr6,title,sold_price
0,742122,4,27,149.0,3808.0,1.0,99.0,one teaspoon bandit distressed denim shorts (23),65.0
1,652751,4,3,89.0,1996.0,,1500.0,gucci emily mini guccissima mini red leather bag,600.0
2,228229,4,26,301.0,5194.0,,89.0,steve madden polka dot wedges 8.5 - wi06,12.0
3,645810,4,27,,6335.0,,0.0,crown & ivy navy blue floral print shorts,12.0
4,854374,4,22,302.0,3606.0,2.0,45.0,grey wide leg dress pants,22.0


# 3. Exploratory Data Analysis

In [114]:
# Descriptive Stats
data.describe().apply(lambda s: s.apply('{0:.5f}'.format))

Unnamed: 0,id,attr1,attr2,attr3,attr4,attr5,attr6,sold_price
count,1029850.0,1029850.0,1029850.0,1029850.0,1029850.0,1029850.0,1029850.0,1029850.0
mean,499963.62359,3.79279,16.11883,185.42626,3005.54674,1.21771,4572186999.40082,195.95868
std,288635.08956,0.53534,10.87543,88.86451,1689.11349,0.41269,4594749960223.79,1440.1873
min,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
25%,250028.25,4.0,3.0,119.0,1804.0,1.0,27.0,15.0
50%,499974.5,4.0,14.0,225.0,2896.0,1.0,119.0,50.0
75%,749908.75,4.0,26.0,252.0,3852.0,1.0,600.0,279.0
max,999850.0,4.0,35.0,306.0,6640.0,3.0,4662595486164680.0,897901.0


In [115]:
# Unique Title
data.title.nunique()/len(data)

0.8057833665096858

In [9]:
# Correlation
data.drop('id', axis = 1).corr()

Unnamed: 0,attr1,attr2,attr3,attr4,attr5,attr6,sold_price
attr1,1.0,-0.052816,-0.029559,-0.044782,-0.03154,-0.001455,0.02194
attr2,-0.052816,1.0,-0.077134,0.062492,-0.010222,0.000884,-0.072571
attr3,-0.029559,-0.077134,1.0,0.010479,-0.014137,0.000964,0.018417
attr4,-0.044782,0.062492,0.010479,1.0,0.060171,0.000416,-0.02999
attr5,-0.03154,-0.010222,-0.014137,0.060171,1.0,-0.001293,-0.005238
attr6,-0.001455,0.000884,0.000964,0.000416,-0.001293,1.0,7.3e-05
sold_price,0.02194,-0.072571,0.018417,-0.02999,-0.005238,7.3e-05,1.0


In [10]:
data = data.sort_values(['id'])

In [11]:
round(len(data)*.5),round(len(data)*.25), round(len(data)*.75)

(514925, 257462, 772388)

In [12]:
# Missing Value Percentage
data.isnull().mean()

id            0.000000
attr1         0.000000
attr2         0.000000
attr3         0.181357
attr4         0.160638
attr5         0.518703
attr6         0.000000
title         0.000086
sold_price    0.000000
dtype: float64

# 4. Train Test Split and *Imputation*

In [13]:
# 75 - 25 percent Train Test Split
train = data.iloc[:772388]
test = data.iloc[772388:]

In [14]:
data.shape, train.shape, test.shape

((1029850, 9), (772388, 9), (257462, 9))

In [15]:
# Imputation using Mode
train.attr3 = train.attr3.fillna(train.attr3.mode()[0])
train.attr4 = train.attr4.fillna(train.attr4.mode()[0])
train.attr5 = train.attr5.fillna(train.attr5.mode()[0])
train.title = train.title.fillna(train.title.mode()[0])

test.attr3 = test.attr3.fillna(train.attr3.mode()[0])
test.attr4 = test.attr4.fillna(train.attr4.mode()[0])
test.attr5 = test.attr5.fillna(train.attr5.mode()[0])
test.title = test.title.fillna(train.title.mode()[0])

In [16]:
# Null Value Check Again
train.isnull().mean()

id            0.0
attr1         0.0
attr2         0.0
attr3         0.0
attr4         0.0
attr5         0.0
attr6         0.0
title         0.0
sold_price    0.0
dtype: float64

# 5. Feature Engineering

In [17]:
# String cleaning o Title column
def clean_str(string):
    """
    String cleaning.
    Original from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    
    return string.strip().lower()

In [18]:
train['title'] = [clean_str(sent) for sent in train['title']]
test['title'] = [clean_str(sent) for sent in test['title']]

In [19]:
train.head()

Unnamed: 0,id,attr1,attr2,attr3,attr4,attr5,attr6,title,sold_price
871108,1,4,9,252.0,2896.0,1.0,1000.0,mandalay dress size6,350.0
125952,2,4,2,24.0,2896.0,1.0,380.0,gucci belt,100.0
940965,3,4,3,252.0,2896.0,1.0,495.0,dior handbag,250.0
483010,4,4,3,252.0,2896.0,1.0,2300.0,collectible gucci purse by tom ford w ostrich,650.0
218425,5,4,26,252.0,1996.0,1.0,700.0,gucci hollywood heel,125.0


In [20]:
# Applying TFIDF vectorizer to title column
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s): return re_tok.sub(r' \1 ', s).split()

n = train.shape[0]
vec = TfidfVectorizer(ngram_range=(1,1), 
               min_df=5000, max_df=0.9, strip_accents='unicode', use_idf=1,
               smooth_idf=1, sublinear_tf=1, stop_words = {'english'} )
trn_term_doc = vec.fit_transform(train['title'])
test_term_doc = vec.transform(test['title'])

In [21]:
trn_term_doc

<772388x131 sparse matrix of type '<class 'numpy.float64'>'
	with 1691976 stored elements in Compressed Sparse Row format>

In [22]:
# Concatenating TFIDF vector with Actual Data for Train
train = pd.concat([train.reset_index(), pd.DataFrame(trn_term_doc.toarray(), columns=vec.get_feature_names()).reset_index()], axis = 1)

In [23]:
train.shape

(772388, 142)

In [24]:
# Concatenating TFIDF vector with Actual Data for Test
test = pd.concat([test.reset_index(), pd.DataFrame(test_term_doc.toarray(), columns=vec.get_feature_names()).reset_index()], axis = 1)

# 6. Feature Reduction and Model Building

In [25]:
# Basic XGBoostRegressor model with default Parameters
xgb_r = xgb.XGBRegressor(objective ='reg:squarederror', seed = 123)

In [26]:
# Fitting the model
xgb_r.fit(train.drop(['id', 'title', 'sold_price', 'index'], axis = 1), train['sold_price'])

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:squarederror',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=123, silent=None, subsample=1, verbosity=1)

In [28]:
feature_imp_df = pd.DataFrame({'Features' : train.drop(['id', 'title', 'sold_price', 'index'], axis = 1).columns, 'Importance': xgb_r.feature_importances_})

In [106]:
feature_imp_df.to_csv('Poshmark_XGB_initial_Feat_imp.csv', index = False)

In [29]:
feature_imp_df.sort_values('Importance', ascending=False)

Unnamed: 0,Features,Importance
124,tote,0.757290
24,brown,0.069540
3,attr4,0.063478
77,monogram,0.037206
109,skirt,0.015437
...,...,...
69,lululemon,0.000000
28,by,0.000000
29,cardigan,0.000000
66,long,0.000000


##### Only Non zero feature importance features will be sent for Random search cv as final features. From 137 features, final 42 non zero features were selected hyper parameter tuning

In [30]:
feature_imp_df_not_zero = feature_imp_df[feature_imp_df['Importance']!=0]

In [31]:
feature_imp_df_not_zero.shape

(42, 2)

# 7. Random Search using Time Series Cross Validation

In [32]:
# Time series fold split
tscv = TimeSeriesSplit(n_splits=2)

In [33]:
# Preparing Random Search CV
param_dist = {'n_estimators': stats.randint(150, 500),
              'learning_rate': stats.uniform(0.01, 0.07),
              'subsample': stats.uniform(0.3, 0.7),
              'max_depth': [3, 4, 5, 6, 7, 8, 9],
              'colsample_bytree': stats.uniform(0.5, 0.45),
              'min_child_weight': [1, 2, 3]
             }
reg = RandomizedSearchCV(xgb_r, param_distributions = param_dist, n_iter = 5,
                         scoring = 'neg_mean_squared_error', verbose = 3, n_jobs = -1, cv= tscv)

In [34]:
# Fitting the model
reg.fit(train[feature_imp_df_not_zero.Features.values],train['sold_price'])

Fitting 2 folds for each of 5 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 39.4min finished


RandomizedSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=2),
                   error_score=nan,
                   estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                          colsample_bylevel=1,
                                          colsample_bynode=1,
                                          colsample_bytree=1, gamma=0,
                                          importance_type='gain',
                                          learning_rate=0.1, max_delta_step=0,
                                          max_depth=3, min_child_weight=1,
                                          missing=None, n_estimators=100,
                                          n_jobs=1, nthread=None,
                                          ob...
                                        'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fdfccd766d8>,
                                        'max_depth': [3, 4, 5, 6, 7, 8, 9],
       

In [108]:
# Best Estimator and Score
print('Best Estimator : ', reg.best_estimator_,) 
print('Best Score : ', reg.best_score_)

Best Estimator :  XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.7393224141935897, gamma=0,
             importance_type='gain', learning_rate=0.05440806709859248,
             max_delta_step=0, max_depth=3, min_child_weight=2, missing=None,
             n_estimators=249, n_jobs=1, nthread=None,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, seed=123, silent=None,
             subsample=0.727716457474308, verbosity=1)
Best Score :  -2930120.7269459087


In [110]:
# Cross Validation Results
pd.DataFrame(reg.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_colsample_bytree,param_learning_rate,param_max_depth,param_min_child_weight,param_n_estimators,param_subsample,params,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score
0,695.32769,259.101377,7.312149,0.731501,0.813411,0.0300298,7,3,380,0.803405,"{'colsample_bytree': 0.8134111335190377, 'lear...",-4305101.0,-2048970.0,-3177035.0,1128065.0,3
1,268.768057,97.285842,2.510196,0.111506,0.721004,0.0646019,4,1,263,0.636652,"{'colsample_bytree': 0.7210035200446687, 'lear...",-4415297.0,-2151455.0,-3283376.0,1131921.0,4
2,365.636661,133.248824,3.673815,0.031777,0.676453,0.0340225,4,3,403,0.786329,"{'colsample_bytree': 0.6764528831873677, 'lear...",-4247620.0,-1678427.0,-2963024.0,1284596.0,2
3,673.31484,194.596195,5.702409,1.084959,0.767256,0.0542254,6,3,490,0.672086,"{'colsample_bytree': 0.7672560798533107, 'lear...",-4327427.0,-3162494.0,-3744960.0,582466.4,5
4,192.434028,66.243746,1.839349,0.023573,0.739322,0.0544081,3,2,249,0.727716,"{'colsample_bytree': 0.7393224141935897, 'lear...",-4246716.0,-1613525.0,-2930121.0,1316596.0,1


In [61]:
# The model with best hyper parameters
xgb_final = xgb.XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7393224141935897, gamma=0,
              importance_type='gain', learning_rate=0.05440806709859248,
              max_delta_step=0, max_depth=3, min_child_weight=2, missing=None,
              n_estimators=249, n_jobs=1, nthread=None,
              objective='reg:squarederror', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, seed=123, silent=None,
              subsample=0.727716457474308, verbosity=1)

In [62]:
# Retraining train data with final hyper parameters from randomsearchcv
xgb_final.fit(train[feature_imp_df_not_zero.Features.values],train['sold_price'])

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.7393224141935897, gamma=0,
             importance_type='gain', learning_rate=0.05440806709859248,
             max_delta_step=0, max_depth=3, min_child_weight=2, missing=None,
             n_estimators=249, n_jobs=1, nthread=None,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, seed=123, silent=None,
             subsample=0.727716457474308, verbosity=1)

In [63]:
# Predicting the Results
train['Predictions'] = xgb_final.predict(train[feature_imp_df_not_zero.Features.values])
test['Predictions'] = xgb_final.predict(test[feature_imp_df_not_zero.Features.values])

# 8. Evaluating the Model

In [64]:
# Train Error RMSE
mean_squared_error(train['sold_price'], train['Predictions']) ** 0.5

1234.2634237290156

In [65]:
# Test Error RMSE
mean_squared_error(test['sold_price'], test['Predictions']) ** 0.5

1596.3373896674454

In [67]:
# Function for calculating error across different buckets
def bucket_error(df):
  bucket_1 = df[df['sold_price'] <= 50]
  bucket_2 = df[(df['sold_price'] > 50) & (df['sold_price'] <= 100)]
  bucket_3 = df[(df['sold_price'] > 100) & (df['sold_price'] <= 500)]
  bucket_4 = df[(df['sold_price'] > 500) & (df['sold_price'] <= 1000)]
  bucket_5 = df[df['sold_price'] > 1000]
  bucket_1_error = mean_squared_error(bucket_1['sold_price'], bucket_1['Predictions']) ** 0.5
  bucket_2_error = mean_squared_error(bucket_2['sold_price'], bucket_2['Predictions']) ** 0.5
  bucket_3_error = mean_squared_error(bucket_3['sold_price'], bucket_3['Predictions']) ** 0.5
  bucket_4_error = mean_squared_error(bucket_4['sold_price'], bucket_4['Predictions']) ** 0.5
  bucket_5_error = mean_squared_error(bucket_5['sold_price'], bucket_5['Predictions']) ** 0.5
  return pd.DataFrame({'bucket_1' : bucket_1_error, 'bucket_2' : bucket_2_error, 'bucket_3' : bucket_3_error, 'bucket_4' : bucket_4_error, 
                       'bucket_5' : bucket_5_error}, index = [0])

In [68]:
# Train Bucket Error
bucket_error(train)

Unnamed: 0,bucket_1,bucket_2,bucket_3,bucket_4,bucket_5
0,87.016211,102.347769,242.870291,500.110339,8852.742892


In [100]:
bucket_error(train).to_csv('Poshmark_train_error.csv', index = False)

In [69]:
# Test Buvket Error
bucket_error(test)

Unnamed: 0,bucket_1,bucket_2,bucket_3,bucket_4,bucket_5
0,100.870347,149.051056,188.592175,862.64956,9753.995345


In [101]:
bucket_error(test).to_csv('Poshmark_test_error.csv', index = False)

In [97]:
# Feature Contribution calculation 
data_100 = train.iloc[:100].copy()
data_100_p2 = data_100[feature_imp_df_not_zero.Features.values].sample(frac = 1).copy()
data_100_p2 = data_100_p2.reset_index()
data_100_p2.drop(['index'], axis = 1, inplace = True)
final_feature_df = pd.DataFrame()
k = 0
for i in tqdm(feature_imp_df_not_zero.Features.values):
  data_100_p1 = data_100_p2.copy()
  data_100_p1[i] = data_100[i]
  phi = 0
  for j in range(len(data_100)):    
    b1 = data_100_p1.iloc[[j]]
    b2 = data_100_p2.iloc[[j]]
    phi =  phi + xgb_final.predict(b1)[0] - xgb_final.predict(b2)[0]
  phi_avg = phi/len(data_100)
  temp_df = pd.DataFrame({'Feature' : i, 'Feature_contrib' : phi_avg}, index = [k])
  final_feature_df = final_feature_df.append(temp_df)
  k = k+ 1

100%|██████████| 42/42 [00:10<00:00,  3.83it/s]


In [112]:
# Feature contribution output
final_feature_df.sort_values('Feature_contrib', ascending = False)

Unnamed: 0,Feature,Feature_contrib
10,chanel,3.443725
9,brown,3.428717
39,vuitton,2.863134
1,attr2,2.215206
27,pants,1.909196
5,attr6,1.651646
3,attr4,0.4809468
7,bag,0.2205469
41,yurman,0.1616646
6,authentic,0.1446065


In [99]:
final_feature_df.to_csv('final_feature_contrib_df.csv', index = False)

In [103]:
final_feature_imp = pd.DataFrame({'Features': feature_imp_df_not_zero.Features.values, 'Feature_importance' : xgb_final.feature_importances_})

In [113]:
# Final SGBoost model feature Importance
final_feature_imp.sort_values('Feature_importance', ascending = False)

Unnamed: 0,Features,Feature_importance
9,brown,0.209596
25,monogram,0.139463
5,attr6,0.11028
22,mens,0.096607
20,louis,0.084933
16,heels,0.080945
37,tote,0.065852
2,attr3,0.032722
1,attr2,0.029731
39,vuitton,0.027641


In [105]:
final_feature_imp.to_csv('Poshmark_XGB_Final_feature_importance.csv', index = False)

# End