In [1]:
import xgboost as xgb
from xgboost import XGBRegressor
import lightgbm as lgb
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [2]:
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA

In [3]:
import pandas as pd
import numpy as np

#Подготовка Данных

In [4]:
train_data = pd.read_csv('../data/train.csv')
test_data = pd.read_csv('../data/test.csv')

In [5]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25062 entries, 0 to 25061
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Name              25062 non-null  object 
 1   Category          25062 non-null  object 
 2   Brand             25061 non-null  object 
 3   Seller            25031 non-null  object 
 4   Color             127 non-null    object 
 5   Comments          25062 non-null  float64
 6   Final price       25062 non-null  float64
 7   Max price         25062 non-null  float64
 8   Min price         25062 non-null  float64
 9   Average price     19264 non-null  float64
 10  Sales             25062 non-null  float64
 11  Days in stock     25062 non-null  float64
 12  Days with sales   25062 non-null  int64  
 13  Rating            25062 non-null  object 
 14  Basic Sale        25062 non-null  float64
 15  Basic Sale Price  25062 non-null  float64
 16  Base price        25062 non-null  float6

Добавим среднюю цену среди категории

In [6]:
avg_price_per_category = []
for cat in np.unique(train_data['Category'].values):
    # print(cat)
    avg_price_per_category.append(train_data[train_data['Category'] == cat]['Average price'].mean())
    cat_mean_price = train_data[train_data['Category'] == cat]['Average price'].mean()
    train_data.loc[train_data['Category'] == cat, 'Avg_price_per_category'] = cat_mean_price
# print(avg_price_per_category)
# train_data.head()

avg_price_per_category = []
for cat in np.unique(test_data['Category'].values):
    # print(cat)
    avg_price_per_category.append(test_data[test_data['Category'] == cat]['Average price'].mean())
    cat_mean_price = test_data[test_data['Category'] == cat]['Average price'].mean()
    test_data.loc[test_data['Category'] == cat, 'Avg_price_per_category'] = cat_mean_price

Добавим фичу, если средняя цена продажи не выше чем средняя цена по сегменту - 1, иначе 0




In [7]:
train_data['avg_price_not_more_than_evg_cat'] = np.where((train_data['Avg_price_per_category'] >= train_data['Average price']), 1, 0)
train_data['avg_price_not_more_than_evg_cat'] = train_data['avg_price_not_more_than_evg_cat'].astype('object')

test_data['avg_price_not_more_than_evg_cat'] = np.where((test_data['Avg_price_per_category'] >= test_data['Average price']), 1, 0)
test_data['avg_price_not_more_than_evg_cat'] = test_data['avg_price_not_more_than_evg_cat'].astype('object')

ratings = train_data['Rating'].values
num_ratings = [int(rating.split(',')[0]) for rating in ratings]
train_data['Rating'] = num_ratings

ratings = test_data['Rating'].values
num_ratings = [int(rating.split(',')[0]) for rating in ratings]
test_data['Rating'] = num_ratings

In [8]:
# print(f"mean: {train_data['Comments'][:len(train_data['Comments'].values)].mean()}")
print(f"mean: {train_data.sort_values(by='Comments').loc[:len(train_data.sort_values(by='Comments')['Comments'].values)-1, 'Comments'].mean()}")

# train_data.sort_values(by='Comments').loc[:len(train_data.sort_values(by='Comments')['Comments'].values)-1, 'Comments'].value_counts()

mean: 4.590824942430126


In [9]:
train_data['comments_more_than_avg'] = (train_data['Comments'] > 5).astype('object')
# pd.crosstab(train_data['comments_more_than_avg'], train_data['Sales'], margins=True)
test_data['comments_more_than_avg'] = (test_data['Comments'] > 5).astype('object')

Удаляем элементы с продажами более 1000 (меньше 1% всех записей)

In [10]:

# train_data = train_data[train_data['Sales'] < 1000.0]


In [11]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25062 entries, 0 to 25061
Data columns (total 21 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Name                             25062 non-null  object 
 1   Category                         25062 non-null  object 
 2   Brand                            25061 non-null  object 
 3   Seller                           25031 non-null  object 
 4   Color                            127 non-null    object 
 5   Comments                         25062 non-null  float64
 6   Final price                      25062 non-null  float64
 7   Max price                        25062 non-null  float64
 8   Min price                        25062 non-null  float64
 9   Average price                    19264 non-null  float64
 10  Sales                            25062 non-null  float64
 11  Days in stock                    25062 non-null  float64
 12  Days with sales   

In [12]:
def deleteId(ds):
    return ds.drop(columns='Id')

def deleteNaN(train_ds, test_ds, critval):
    fullsize = train_ds.shape[0]
    new_train = train_ds.copy()
    new_test = test_ds.copy()
    for feature in new_train.columns:
        nulls = new_train[feature].isnull().sum()
        percent = nulls / fullsize
        # если доля пустых значений превышает critval - столбец не информативен,
        # можно его выбросить
        if (percent > critval):
            new_train = new_train.drop(columns=feature)
            new_test = new_test.drop(columns=feature)
    return new_train, new_test

def convertToNumeric(train_ds, test_ds):
    new_train = train_ds.copy()
    new_test = test_ds.copy()
    LE = LabelEncoder()
    for feature in new_train.columns:
        if (new_train[feature].dtype == 'object'):
            new_train[feature] = LE.fit_transform(new_train[feature])
            new_test[feature] = LE.fit_transform(new_test[feature])
    return new_train, new_test

train = train_data.drop_duplicates()

# train = train.drop(columns='Id')
train = train.drop(columns='Color')
train = train.drop(columns='Base price')
train = train.drop(columns='Basic Sale Price')

test = test_data.drop(columns='Id')
test = test.drop(columns='Color')
test = test.drop(columns='Base price')
test = test.drop(columns='Basic Sale Price')

train, test = deleteNaN(train, test, critval=0.8)
train, test = convertToNumeric(train, test)

train_y = train['Sales'].values
train_X = train.drop(columns='Sales').values
val_test = test.values

train_x, test_x, train_y, test_y = train_test_split(train_X, train_y, test_size=0.1, random_state=7)  # random_state=98987)

#Избавимся от NaN значений


In [13]:
strategies = ['mean', 'median', 'most_frequent']
imputer = SimpleImputer(strategy=strategies[2])
trainX = imputer.fit_transform(train_x)
testX = imputer.fit_transform(test_x)
val_test_x = imputer.fit_transform(val_test)

In [14]:
scaler = StandardScaler()
trainX = scaler.fit_transform(trainX)
testX = scaler.transform(testX)
val_test_x = scaler.transform(val_test_x)

In [15]:
trainX.shape

(22086, 17)

In [16]:
val_test.shape

(10741, 17)

#Метрика **SMAPE**

In [17]:
def smape(A, F):
    with np.errstate(divide='ignore', invalid='ignore'):
        tmp = 2 * np.abs(F-A) / (np.abs(A) + np.abs(F))
    tmp[np.isnan(tmp)] = 0
    return np.sum(tmp) / len(tmp) * 100

In [18]:
def check_error(preds, gt):
    print('SMAPE Error:', smape(np.round(preds), gt))
    # print('RMSE Error:', mean_squared_error(np.round((np.abs(preds))), gt, squared=False))

#Random Forest

In [19]:
# parameters = {
#     'criterion':(['absolute_error']), 
#     'max_depth':  [100, 500, 100],
#     'n_estimators': [40, 50, 60, 55],
#     'min_samples_leaf': [3, 4, 5],
#     'min_samples_split': range(10, 71, 20)}

# model = RandomForestRegressor()
# rf_gs_model = GridSearchCV(model, parameters)
# rf_gs_model.fit(trainX[:300], train_y[:300])
# print(f"Best parameters: {rf_gs_model.best_params_}")

In [20]:
last_rf_best_params = {
    'criterion': 'absolute_error', 
    'max_depth': 350,  # 300
    'n_estimators': 70, # 70
    'min_samples_leaf': 4, # 3
    'min_samples_split': 50 # 50
}

# random_forest = RandomForestRegressor(**rf_gs_model.best_params_)
random_forest = RandomForestRegressor(**last_rf_best_params) 
random_forest.fit(trainX, train_y)
check_error(random_forest.predict(testX), test_y)

SMAPE Error: 20.986130844369683


In [21]:
check_error(random_forest.predict(testX), test_y)
print(random_forest.predict(testX))

SMAPE Error: 20.986130844369683
[605.60714286  17.66428571  85.54285714 ...   0.          38.27857143
  23.07142857]


#XGBoost

In [22]:
# parameters = {
#     "learning_rate": [0.1, 0.01, 0.075, 0.015],
#     "max_depth": [3, 4, 10, 100],
#     "min_child_weight": [3, 5, 7, 10],
#     # "n_estimators": range(10, 51, 10),
#     "subsample": [0.6, 0.7, 0.8, 0.9, 1.],
#     "gamma": [0.5, 0.7, 0.3, 1.],
#     "reg_lambda": [0.6, 0.7, 0.9, 1.0],
#     'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.]
# }

# model = XGBRegressor(objective='reg:squarederror')
# xgb_gs_model = GridSearchCV(model, parameters)
# xgb_gs_model.fit(new_trainX, train_y)
# print(f"Best parameters: {xgb_gs_model.best_params_}")

# gridcv_xgb = xgb_gs_model.best_estimator_
# check_error(np.round(gridcv_xgb.predict(new_testX)), test_y)

In [23]:
# last_xgb_best_params = {
#     "learning_rate": 0.035,
#     "max_depth": 350,
#     "min_child_weight": 14,
#     "n_estimators": 50, 
#     "subsample": 0.9,
#     # "booster": 'gbtree',
#     "reg_lambda": 1.,
#     "gamma": 0.3,
#     'colsample_bytree': 0.9
# }
# xgb_reg = XGBRegressor(**last_xgb_best_params) 
# # xgb_reg = XGBRegressor(**xgb_gs_model.best_params_)
# xgb_reg.fit(trainX, train_y)
# check_error(np.round(xgb_reg.predict(testX)), test_y)

# # xgb_reg.fit(new_trainX, train_y)
# # check_error(np.round(xgb_reg.predict(new_testX)), test_y)

#Соревы

In [24]:
submission_path = '../submissions/submission.csv'
submission = pd.read_csv(submission_path)

In [25]:
predictions = np.round(random_forest.predict(val_test_x))
submission['Expected'] = predictions
submission
submission.to_csv('../submissions/submission.csv', index=False)

In [26]:
# predictions = np.round(xgb_reg.predict(val_test_x))
# submission['Expected'] = predictions
# submission
# submission.to_csv('submission_xgboost.csv', index=False)