In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

## Importing the Datasets

In [None]:
# Read both competition files, using the 'id' column as the row index.
train, test = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in ('../input/final-capstone/train.csv',
                     '../input/final-capstone/test.csv')
)

In [None]:
# Preview the first rows of the training frame.
train.head()

## EDA

In [None]:
# (rows, columns) of the training and test frames, shown as a tuple of tuples.
train.shape, test.shape

In [None]:
# Summary statistics of the training frame.
train.describe()

In [None]:
# Column dtypes and non-null counts of the training frame.
train.info()

#### Missing Values

In [None]:
# Number of missing values per column in the training frame.
train.isna().sum()

In [None]:
# Number of missing values per column in the test frame.
test.isna().sum()

In [None]:
# Visualise where values are missing: one heatmap row per record, one column
# per feature; row tick labels are hidden.
missing_mask = train.isnull()
plt.figure(figsize=(18, 7))
sns.heatmap(missing_mask, yticklabels=False)
plt.show()

In [None]:
# Distribution of the 'Rating' column in 15 bins.
train['Rating'].hist(bins=15)
plt.show()

In [None]:
# Count of records per platform, split by main category.
sns.countplot(x = train.platform, hue = train.maincateg, palette = 'Set2')
# The original referenced `plt.show` without calling it, so the figure was
# never explicitly shown; call it.
plt.show()

In [None]:
# Share of records per main category, drawn as a pie chart.
train.groupby('maincateg')['maincateg'].count().plot.pie()

In [None]:
# Pairwise correlation of the numeric columns.
# `train` still holds string columns at this point (e.g. 'title', 'maincateg',
# 'platform'); recent pandas versions raise when DataFrame.corr() meets
# non-numeric data, so select the numeric columns explicitly.
numeric_corr = train.select_dtypes(include='number').corr()

plt.figure(figsize=(15,9))
sns.heatmap(numeric_corr, annot=True, cmap='mako')
plt.show()

## Imputing missing values 

Manually imputing missing 'maincateg' values from the `title` column

In [None]:
# Rows whose 'maincateg' is missing, for train and test respectively.
non_cat1 = train.loc[train['maincateg'].isna()]
non_cat2 = test.loc[test['maincateg'].isna()]

# Independent copies that will receive the inferred category values.
n_val1, n_val2 = non_cat1.copy(), non_cat2.copy()
n_val2

In [None]:
# For every title, collect all occurrences of 'Men' or 'Women' (case-sensitive)
# as a list; `a` is for train, `b` for test.
gender_pattern = 'Men|Women'
a = train['title'].str.findall(gender_pattern)
b = test['title'].str.findall(gender_pattern)

In [None]:
# `a` holds Python lists, which are unhashable, so value_counts() cannot count
# them directly; count their string representation instead.
# Original author's note: women -> more frequent customer.
a.astype(str).value_counts()

In [None]:
# Infer the missing category from the title matches in `a`.
# A row becomes 'Men' when its title matched at least once and every match is
# 'Men'; everything else (no match, any 'Women' match, missing title) becomes
# 'Women', as before. The original `== ['Men']` test wrongly sent titles that
# mention "Men" more than once (['Men', 'Men']) to 'Women'.
n_val1['maincateg'] = [
    'Men' if isinstance(matches, list) and set(matches) == {'Men'} else 'Women'
    for matches in a.loc[n_val1.index]
]

In [None]:
# Infer the missing category from the title matches in `b`.
# A row becomes 'Men' when its title matched at least once and every match is
# 'Men'; everything else (no match, any 'Women' match, missing title) becomes
# 'Women', as before. The original `== ['Men']` test wrongly sent titles that
# mention "Men" more than once (['Men', 'Men']) to 'Women'.
n_val2['maincateg'] = [
    'Men' if isinstance(matches, list) and set(matches) == {'Men'} else 'Women'
    for matches in b.loc[n_val2.index]
]

In [None]:
# Inspect the training rows after filling in 'maincateg'.
n_val1

In [None]:
# Keep only the inferred 'maincateg' column; both names now refer to a Series.
# NOTE: not re-runnable as-is, because the names are rebound from DataFrame to
# Series.
n_val1, n_val2 = n_val1['maincateg'], n_val2['maincateg']

In [None]:
# Preview the training frame before the inferred categories are written back.
train.head()

In [None]:
#train1.loc[(train1.maincateg == np.nan) , 'maincateg'] = n_val.maincateg
#train1['maincateg'].replace(np.nan , n_val['maincateg'],inplace=True)
#train1.loc[train1["maincateg"] == 0, :] = n_val
#train1['maincateg'].mask(train1['maincateg'] == np.nan, n_val['maincateg'], inplace=True)
#frame = [train1,n_val]
#new_train = pd.concat(frame, axis = 0)

In [None]:
# Write the inferred categories back into `train` with one index-aligned
# assignment instead of a per-row Python loop.
train.loc[n_val1.index, 'maincateg'] = n_val1

In [None]:
# Write the inferred categories back into `test` with one index-aligned
# assignment instead of a per-row Python loop.
test.loc[n_val2.index, 'maincateg'] = n_val2

In [None]:
# Inspect the first 20 training rows after the 'maincateg' write-back.
train.head(20)
#train1.isna().sum()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Missing values per column in the test frame after the 'maincateg' fill.
test.isna().sum()

In [None]:
# Remove columns that are not used as model features.
# Both drops mutate the frames in place, so re-running this cell raises
# KeyError because the columns are already gone.
train.drop(columns=['title', 'Offer %'], inplace=True)
test.drop(columns='title', inplace=True)

In [None]:
# Keep a handle on the target column for the training-error metrics below.
price1 = train['price1']

## Capping outlier values

In [None]:
# Missing values per remaining column in the training frame.
train.isna().sum()

In [None]:
# One box per column of `train`, to spot extreme values.
plt.figure(figsize=(20, 7))
sns.set_theme(style="whitegrid")
sns.boxplot(data=train)
plt.show()

In [None]:
# One box per column of `test`, for comparison with the training frame.
plt.figure(figsize=(20, 7))
sns.boxplot(data=test)
plt.show()

In [None]:
# Box plot of the 'star_3f' column in the test frame.
sns.boxplot(x=test['star_3f'])
plt.show()

In [None]:
# price1 against norating1, coloured by star_5f.
plt.figure(figsize=(20, 7))
sns.set_theme(style="whitegrid")
sns.scatterplot(x='price1', y='norating1', hue='star_5f', data=train)
plt.show()

In [None]:
# price1 against star_3f (no hue).
plt.figure(figsize=(20, 7))
sns.set_theme(style="whitegrid")
sns.scatterplot(x='price1', y='star_3f', data=train)

In [None]:
# price1 against noreviews1 in the training frame.
sns.scatterplot(data=train, x = 'price1', y = 'noreviews1')

In [None]:
# Upper limits per column. A row is flagged when ANY listed column exceeds its
# limit. Keeping the limits in one table guarantees that train and test are
# screened with identical thresholds (they were previously written out twice).
OUTLIER_LIMITS = {
    'norating1': 100000,
    'star_5f': 140000,
    'noreviews1': 40000,
    'star_4f': 60000,
    'star_3f': 30000,
}


def _outlier_mask(frame, limits=OUTLIER_LIMITS):
    """Return a boolean Series (indexed like ``frame``) marking outlier rows.

    A row is marked True when at least one column named in ``limits`` holds a
    value strictly greater than that column's limit. Missing values compare
    as False and therefore never flag a row.
    """
    return frame[list(limits)].gt(pd.Series(limits)).any(axis=1)


outliers = _outlier_mask(train)
outliers1 = _outlier_mask(test)

In [None]:
# Pull out the flagged rows of each frame using the boolean masks.
outs1 = train.loc[outliers]
outs2 = test.loc[outliers1]
outs1.head()

In [None]:
# Count-type columns to shrink for the flagged rows.
coll = ['norating1','noreviews1','star_5f','star_4f','star_3f','star_2f','star_1f']

# Every count of a flagged row is divided by 4 (a fixed factor, not a clip to
# a threshold); the other columns are dropped from outs1/outs2.
outs1 = outs1[coll].div(4)
outs2 = outs2[coll].div(4)

In [None]:
# Preview the rescaled outlier rows.
outs1.head()

In [None]:
#outs1.describe()

In [None]:
#outs1.median()

In [None]:
# Write the rescaled counts back into the flagged rows.
# Columns are addressed by the names present in outs1/outs2 rather than by the
# label slice 'norating1':'star_1f': the slice depends on column order, and any
# column inside it that is absent from outs1/outs2 would be overwritten with
# NaN by the aligned assignment.
train.loc[outliers, outs1.columns] = outs1
test.loc[outliers1, outs2.columns] = outs2

In [None]:
# Spot-check a single training row by its id label after the write-back.
# NOTE(review): 14019 is hard-coded; .loc raises KeyError if that id is absent.
train.loc[14019,:]

In [None]:
# Re-draw the per-column box plot of `test` after the outlier rescaling.
plt.figure(figsize=(20, 7))
sns.set_theme(style="whitegrid")
sns.boxplot(data=test)
plt.show()

## Model

In [None]:
# Target is 'price1'; every other column of `train` is a feature.
y_train = train['price1']
x_train = train.drop(columns='price1')

In [None]:
#x_train.maincateg.value_counts()

In [None]:
# Display the feature frame (Jupyter shows a truncated view for large frames).
x_train

In [None]:
# Display the test frame (Jupyter shows a truncated view for large frames).
test

#### One hot encoding

In [None]:
# Names of the categorical columns to one-hot encode (a plain list of column
# names, not an encoder object).
ohe = ['maincateg','platform']

In [None]:
# One-hot encode the categorical columns listed in `ohe`.
x_train = pd.get_dummies(x_train, columns=ohe)

# get_dummies only creates columns for the categories it sees, so encoding
# train and test independently can yield different column sets or orders.
# Align test to the training layout: dummy columns absent from test are added
# as all-zero, columns unknown to training are dropped, and the order matches.
test = pd.get_dummies(test, columns=ohe).reindex(columns=x_train.columns, fill_value=0)

#### Imputation of numerical cols

In [None]:
# Imputer that replaces missing values with the column median (fitted later).
imp = SimpleImputer(strategy = 'median')

In [None]:
# Missing values per column in the encoded test frame, before imputation.
test.isna().sum()

In [None]:
# Learn the medians on the training features only, then apply the same
# medians to the test features. The arrays returned by the imputer are wrapped
# back into DataFrames carrying the original labels.
train_imputed = imp.fit_transform(x_train)
x_train = pd.DataFrame(train_imputed, index=x_train.index, columns=x_train.columns)

test_imputed = imp.transform(test)
test = pd.DataFrame(test_imputed, index=test.index, columns=test.columns)

In [None]:
# Check missing values per feature after imputation.
x_train.isnull().sum()

#### Standard Scaler

In [None]:
# Standardise features: the scaler is fitted on the training features and the
# same transformation is then applied to the test features.
scaler = StandardScaler()

train_scaled = scaler.fit_transform(x_train)
x_train = pd.DataFrame(train_scaled, index=x_train.index, columns=x_train.columns)

test_scaled = scaler.transform(test)
test = pd.DataFrame(test_scaled, index=test.index, columns=test.columns)

In [None]:
#numerical_transformer = SimpleImputer(strategy = 'median')
#categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),('onehot', OneHotEncoder(handle_unknown='ignore',sparse = False))])
#categorical_transformer = OneHotEncoder(handle_unknown='ignore',sparse = False)

#preprocessor = ColumnTransformer(transformers=[('num', numerical_transformer, numerical_cols)])#, ('cat', categorical_transformer, categorical_cols)])

In [None]:
#rf = RandomForestRegressor(random_state = 42)
#pipe = Pipeline(steps = [('preprocessor', preprocessor),('rf', rf)])
#cv_score = cross_val_score(pipe,x_train,y_train,cv =5)
#cv_score.mean()
#pipe = Pipeline(steps = [('preprocessor', preprocessor),('rf', RandomForestRegressor(random_state = 42,n_estimators = 300, bootstrap = False,max_features='sqrt', warm_start = True))])

#### Flaml to find the best model

In [None]:
!pip install flaml
from flaml import AutoML
from flaml.default import RandomForestRegressor, LGBMRegressor, ExtraTreesRegressor

In [None]:
#for training the best model and its parameters
#automl = AutoML(task = 'regression', time_budget = 900, estimator_list = ['extra_tree'])
#automl.fit(x_train,y_train)

In [None]:
# Extra-trees regressor with fixed hyperparameters.
# random_state is pinned so repeated runs build the same randomised trees
# (the original left it unset, making results vary between runs).
et = ExtraTreesRegressor(max_features=0.5968495538899103, max_leaf_nodes=7864,
                    n_estimators=53, n_jobs=-1, random_state=42)
et.fit(x_train,y_train)

In [None]:
#best score yet
et3 = ExtraTreesRegressor(max_features=0.47519952135022675, max_leaf_nodes=7864,
                    n_estimators=429, n_jobs=-1)
et3.fit(x_train,y_train)

In [None]:
# Predictions on the training features themselves (used for training-error
# metrics, not as a held-out estimate).
preds_train = et3.predict(x_train)

In [None]:
# Predictions for the preprocessed test features.
preds = et3.predict(test)

In [None]:
# Root-mean-squared error of the training-set predictions.
# The `squared=False` argument of mean_squared_error is deprecated and removed
# in newer scikit-learn releases; taking the square root of the MSE gives the
# same value on every version.
rms = np.sqrt(mean_squared_error(price1, preds_train))
rms

In [None]:
# Mean absolute error of the training-set predictions against the target.
print('MAE Extra Trees:', mean_absolute_error(price1, preds_train))

In [None]:
# 5-fold cross-validation of et3 on the training data; no `scoring` is given,
# so the estimator's own default score is used. The mean over folds is shown.
# NOTE(review): x_train was imputed and scaled on the full training set before
# this split, so the folds are not fully independent of the preprocessing.
cv_score = cross_val_score(estimator=et3, X=x_train, y=y_train, cv=5)
cv_score.mean()

### Submission

In [None]:
# Build the submission table (test ids alongside the predicted price1) and
# write it to CSV without the positional index.
submission_columns = {'id': test.index, 'price1': preds}
output = pd.DataFrame(submission_columns)
output.to_csv('sub.csv', index=False)

### Feature Importance

In [None]:
# Importance of each feature according to the fitted et3 model, stored in
# ascending order of (Value, Feature).
feature_imp = (
    pd.DataFrame({'Value': et3.feature_importances_, 'Feature': x_train.columns})
    .sort_values(by=['Value', 'Feature'])
    .reset_index(drop=True)
)

plt.figure(figsize=(20, 10))
# Plot the most important feature at the top.
sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value", ascending=False))
# The importances come from the single fitted et3 model, not from an average
# over cross-validation folds, so the title no longer claims fold averaging.
plt.title('Extra Trees feature importances')
plt.tight_layout()
plt.show()