In [36]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import warnings

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.metrics import r2_score, mean_squared_error

from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, RobustScaler

warnings.filterwarnings("ignore")
%matplotlib inline
sns.set()


In [39]:
train_data = pd.read_csv('bigmart-sales-data/train_v9rqX0R.csv')
train_data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [40]:
train_data['Item_Fat_Content'].replace(
    {'low fat': 'Low Fat', 'LF': 'Low Fat', 'reg': 'Regular'}, inplace=True)

train_data['Outlet_Location_Type'].replace(
    {'Tier 1':0, 'Tier 2':1, 'Tier 3':2}, inplace=True)
train_data['Outlet_Type'].replace(
    {'Supermarket Type1':0, 'Supermarket Type2':1, 
     'Supermarket Type3':2, 'Grocery Store':3}, inplace=True)

In [41]:
train_no_null = train_data[train_data.Item_Weight.isnull() == False]
train_null = train_data[train_data.Item_Weight.isnull() == True]

item_avg = train_no_null[['Item_Identifier', 'Item_Weight']].groupby(
    by='Item_Identifier', as_index=False).mean()

tmp_data = pd.merge(right=train_null.drop('Item_Weight', axis=1), left=item_avg,
                    right_on='Item_Identifier', left_on='Item_Identifier', how='inner')

train_data = pd.concat([train_no_null, tmp_data], axis=0)
train_data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,0,0,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,2,1,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,0,0,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,2,3,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,2,0,994.7052


In [42]:
train_no_null = train_data[train_data.Outlet_Size.isnull() == False]
train_null = train_data[train_data.Outlet_Size.isnull() == True]

In [43]:
temp = train_null[['Outlet_Identifier', 'Outlet_Size', 'Item_Outlet_Sales']]
temp[temp.Outlet_Size.isna() == True][['Outlet_Identifier',
                                       'Item_Outlet_Sales']].groupby('Outlet_Identifier').sum()


Unnamed: 0_level_0,Item_Outlet_Sales
Outlet_Identifier,Unnamed: 1_level_1
OUT010,188340.2
OUT017,2167465.0
OUT045,2036725.0


In [44]:
train10 = train_null[train_null['Outlet_Identifier'] == 'OUT010']
train10.replace(np.nan, 'Medium', inplace=True)

train17 = train_null[train_null['Outlet_Identifier'] == 'OUT017']
train17.replace(np.nan, 'Medium', inplace=True)

train45 = train_null[train_null['Outlet_Identifier'] == 'OUT045']
train45.replace(np.nan, 'Small', inplace=True)


In [45]:
data = pd.concat([train_no_null, train10, train17, train45], axis=0)
data['Outlet_Size'].replace(
    {'Small': 0, 'Medium': 1, 'High': 2}, inplace=True)
data.head()


Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,1,0,0,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,1,2,1,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,1,0,0,2097.27
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,2,2,0,994.7052
5,FDP36,10.395,Regular,0.0,Baking Goods,51.4008,OUT018,2009,1,2,1,556.6088


In [46]:
encoder = LabelEncoder()
data['Item_Fat_Content'] = encoder.fit_transform(data['Item_Fat_Content'])
data['Item_Type'] = encoder.fit_transform(data['Item_Type'])

In [47]:
X = data.drop(['Item_Identifier', 'Outlet_Identifier', 'Item_Outlet_Sales'], axis=1)
y = data['Item_Outlet_Sales']

In [48]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42)


In [49]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [50]:
best_param = {'criterion': 'poisson',
              'max_depth': 6,
              'max_features': 7,
              'min_samples_leaf': 2,
              }


In [51]:
model = RandomForestRegressor(**best_param)
model.fit(X_train, y_train)
y_test_pred = model.predict(X_test)


In [52]:
mse_test = mean_squared_error(y_true=y_test, y_pred=y_test_pred)
mse_test


1084049.3785487309

In [53]:
rmse_test = np.sqrt(mse_test)
rmse_test


1041.176919907818

In [54]:
r2 = r2_score(y_true=y_test, y_pred=y_test_pred)
r2


0.6150053607224522

In [55]:
adj_r2 = 1 - (1-r2)*(len(y_test)-1) / (len(y_test)-X_test.shape[1]-1)
adj_r2

0.613370949517972

In [56]:
a =[3, 1, 2]
a.sort()
print(a)

[1, 2, 3]


def objective_classification(trial, data=X, target=y):
    train_x, test_x, train_y, test_y = train_test_split(
        data, target, test_size=.20, random_state=10)

    param = {
        'tree_method': 'gpu_hist',
        'verbosity': 3,
        'objective': "binary:logistics",
        'booster': trial.suggest_categorical('booster', ['dart', 'gbtree', 'gblinear']),
        'lambda': trial.suggest_float('lambda', 1e-4, 1),
        'alpha': trial.suggest_float('alpha', 1e-4, 1),
        'subsample': trial.suggest_float('subsample', .1, .5),
        'colsample_bytree': trial.suggest_float('colsample_bytree', .1, .5)

    }

    if param['booster'] in ['gbtree', 'dart']:
        param['gamma']: trial.suggest_float('gamma', 1e-3, 4)
        param['eta']: trial.suggest_float('eta', .001, 5)

    xgb_classification = xgboost.XGBClassifier(**param)
    xgb_classification.fit(train_x, train_y, eval_set=[(test_x, test_y)])
    pred = xgb_classification.predict(test_x)
    a
