In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from loo_encoder.encoder import LeaveOneOutEncoder

from nose.tools import *
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.metrics import make_scorer
from scipy import stats
import seaborn as sns

  from numpy.core.umath_tests import inner1d


In [2]:
df = pd.read_pickle('../data/Fendi_refined.pkl')

In [3]:
df = df[-df['bags_price_refined'].isnull()]

In [4]:
df = df[df['lifetime'] < 1000]

In [5]:
def label_class(x):
    if x <= 7:
        return 0
    else:
        return 1

In [6]:
df['label'] = df['lifetime'].map(label_class)

In [7]:
df = df[df['size'] != 'Other']

# Encoding
readme: screencapture-towardsdatascience-all-about-categorical-variable-encoding-305f3361fd02-2020-02-23-23_08_01

In [8]:
cols = list(set(df.columns) - {'bags_brand', 'sc_date', 'sold_price_refined', 'id', 'sc_date_last_date', 'sc_date_first_date',
                  'sc_date_last_date', 'bags_condition', 'lifetime'})
categorical_cols = ['materials_list', 'bags_color', 'size']
print(cols)

['label', 'likes_last_1', 'likes_last_7', 'bags_color', 'materials_list', 'retail_price_refined', 'days_live', 'avg_similar', 'condition', 'number_similar', 'original_to_avg', 'bags_price_refined', 'size', 'likes']


In [9]:
X_train, X_test, y_train, y_test = train_test_split(df[cols], df['label'], test_size=0.3, random_state=42)

In [10]:
y_test.shape

(229,)

In [11]:
enc = LeaveOneOutEncoder(cols=categorical_cols, handle_unknown='value', sigma=0.02, random_state=42)

In [12]:
y_train.shape

(532,)

In [13]:
X_train_new = enc.fit_transform(X=X_train, y=y_train, handle_missing='value')

In [14]:
X_train_new.shape

(532, 20)

In [15]:
X_test_new = enc.transform(X=X_test)

In [16]:
X_test_new.shape

(229, 20)

In [17]:
cols = ['avg_similar', 'number_similar', 'condition', 
       'original_to_avg', 'likes', 'days_live', 'likes_last_7',
       'likes_last_1',
       'bags_price_refined',
       'retail_price_refined', 'loo_materials_list',
       'loo_bags_color', 'loo_size']

In [18]:
X_test_new_1 = X_test_new[cols]
X_train_new_1 = X_train_new[cols]

In [19]:
X_test_new_1.isnull().sum()

avg_similar               0
number_similar            0
condition                 0
original_to_avg           0
likes                     0
days_live                 0
likes_last_7              0
likes_last_1              0
bags_price_refined        0
retail_price_refined      0
loo_materials_list        5
loo_bags_color          124
loo_size                  0
dtype: int64

In [18]:
X_train_new_1.fillna(value = X_train_new_1.mean(), inplace = True)
X_test_new_1.fillna(value = X_train_new_1.mean(), inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [26]:
# scaling features

min_max_scaler = MinMaxScaler()
X_train_standard = min_max_scaler.fit_transform(X_train_new_1)

In [35]:
linear_model = LinearRegression()
linear_model.fit(X_train_new_1, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [36]:
print('R_square score on the training: %.6f' % linear_model.score(X_test_new_1, y_test))

R_square score on the training: -0.272405


# Decision Tree Regression

In [34]:
decision_tree_model = DecisionTreeRegressor()
decision_tree_fit = decision_tree_model.fit(X_train_new_1, y_train)

decision_tree_score = cross_val_score(decision_tree_fit, X_train_new_1, y_train, cv = 5)
print("mean cross validation score: %.2f"  % np.mean(decision_tree_score))
print("score without cv: %.2f" % decision_tree_fit.score(X_test_new_1, y_test))


mean cross validation score: -1.27
score without cv: -1.00


In [31]:
decision_tree_score

array([-1.1950864 , -1.2051273 , -1.34867624, -0.98147196, -1.37108196])

In [15]:
X = df[['bags_price_refined', 'likes', 'condition', 
      'days_live', 'likes_last_1', 'likes_last_7', 'number_similar', 'original_to_avg']]
Y = df['lifetime']

In [16]:
linear_model = LinearRegression()
linear_model.fit(X, Y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [18]:
linear_model.score(X, Y)

0.2624370545974801

In [21]:
import xgboost as xgb

In [23]:
model = xgb.XGBClassifier(max_depth=5, objective='multi:softprob', n_estimators=1000, 
                        num_classes=3)
model.fit(X_train_new_1, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=5, min_child_weight=1, missing=None,
       n_estimators=1000, n_jobs=1, nthread=None, num_classes=3,
       objective='multi:softprob', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)

In [25]:
# make predictions for test data
y_pred = model.predict(X_test_new_1)
predictions = [round(value) for value in y_pred]

  if diff:


In [30]:
# evaluate predictions
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 44.98%
