# <center>⚡️ Quick start 🏁</center>

In [None]:
from src.feature_engineering import *

In [None]:
import warnings

warnings.filterwarnings('ignore')

## Данные

### Описание столбцов (**ОБЯЗАТЕЛЬНО ПОТРАТИТЬ НА ЭТО ВРЕМЯ**)

In [None]:
# Load the raw train and test tables from ../data.
# nrows=None reads every row of train (set a number for a quick dry run).
# parse_dates=[] parses no column as a date — TODO: list the datetime columns here
# if later cells expect real datetime dtypes.
train = pd.read_csv("../data/train.csv", nrows=None, parse_dates=[])
test = pd.read_csv("../data/test.csv", parse_dates=[])

### Знакомство

In [None]:
train.head()

In [None]:
train.dtypes

In [None]:
train.isnull().sum()

### Пропуски

### Выбросы (query boxplot hist)

### Разделение

In [None]:
# Name of the label column in train — TODO: fill in before running this cell.
target_name = ''

# Detach the label column from train (train is modified in place) and renumber it from 0.
target = train.pop(target_name).reset_index(drop=True)

# Stack train on top of test under a fresh 0..n-1 index so the feature
# engineering below runs once over both.
dataset = pd.concat([train, test], ignore_index=True)

In [None]:
# Number of train rows; used later to cut `dataset` back into train and test.
train_size = len(train)

## **Feature engineering**

### **GeoFeatures** (генерация признаков по широте-долготе)

In [None]:
# Points of interest with their radius, as name: [(latitude, longitude), radius].
# Empty by default — TODO: fill in; units of the radius are not visible here.
points = {}

In [None]:
dataset = add_1geo_features(dataset, 'pickup', points)
dataset.sample(3)

In [None]:
dataset = add_1geo_features(dataset, 'dropoff', points)
dataset.sample(3)

In [None]:
dataset = add_2geo_features(dataset, 'pickup', 'dropoff')
dataset.sample(3)

### **Timestamp** (обработка временных фич)

In [None]:
dataset = add_1time_features(dataset, 'pickup_datetime')
dataset.sample(3)

### ***Memory***

In [None]:
def reduce_mem_usage(df: pd.DataFrame, use_float16: bool = False) -> pd.DataFrame:
    """Downcast the numeric columns of ``df`` to the smallest dtype that holds them.

    The frame is modified in place and also returned, so both
    ``reduce_mem_usage(df)`` and ``df = reduce_mem_usage(df)`` work.
    Memory usage before and after is printed.

    Args:
        df: Frame whose integer and float columns should be shrunk.
        use_float16: Allow float columns to shrink to float16. Off by default:
            the range check only guards against overflow, while float16 keeps
            about three significant decimal digits and would silently round
            values such as coordinates.

    Returns:
        The same ``df`` object with downcast columns.

    Notes:
        * Only plain NumPy integer/float columns are touched. Object, bool,
          datetime and pandas extension dtypes (categorical, tz-aware datetime,
          nullable integers, ...) are left as they are.
        * A column is never widened, and an unsigned column whose values do not
          fit a smaller signed type keeps its dtype instead of overflowing.
        * Columns whose min/max is NaN (empty or all-NaN) are left unchanged.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate targets, smallest first; 64-bit types are never a downcast.
    int_candidates = (np.int8, np.int16, np.int32)
    float_candidates = ((np.float16,) if use_float16 else ()) + (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes may hold NA or non-numeric data that a NumPy cast rejects.
        if not isinstance(col_type, np.dtype):
            continue

        if pd.api.types.is_integer_dtype(col_type):
            candidates, limits, as_scalar = int_candidates, np.iinfo, int
        elif pd.api.types.is_float_dtype(col_type):
            candidates, limits, as_scalar = float_candidates, np.finfo, float
        else:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if pd.isna(c_min) or pd.isna(c_max):
            continue

        # Plain Python scalars keep the range comparison exact for every dtype,
        # including uint64 values above the int64 maximum.
        c_min, c_max = as_scalar(c_min), as_scalar(c_max)

        for candidate in candidates:
            # Candidates grow in size: once one is not smaller, none will be.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            bounds = limits(candidate)
            if bounds.min <= c_min and c_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the percentage against a zero-sized starting frame.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

In [None]:
dataset = reduce_mem_usage(dataset)

## **INF** -> **NaN**

In [None]:
dataset.replace([-np.inf, np.inf], np.nan, inplace=True)

## ***Total***

In [None]:
nunique = dataset.nunique()
nunique[nunique == 1]

In [None]:
# Column groups used by the rest of the notebook.
# features2drop and text_features start empty — TODO: fill in by hand.
features2drop = []
text_features = []
data_features = dataset.select_dtypes('datetime64[ns, UTC]').columns.drop(features2drop, errors='ignore').tolist() # datetime features
# Every column except the dropped ones and the datetime ones
filter_features = dataset.columns.drop(features2drop + data_features, errors='ignore').tolist()
cat_features = dataset.select_dtypes("object").columns.drop(text_features + features2drop, errors='ignore').tolist() # categorical features
num_features = dataset.select_dtypes("number").columns.drop(features2drop, errors='ignore').tolist() # numeric features

# Report the size and content of each group
print('features2drop :', len(features2drop), features2drop)
print('data_features :', len(data_features), data_features)
print('cat_features :', len(cat_features), cat_features)
print('text_features :', len(text_features), text_features)
print('num_features :', len(num_features), num_features)

## *Preprocessing*

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
pipeline_preprocessing = Pipeline([
    ('standart_scaler', StandardScaler())
])

In [None]:
# Standardize the numeric columns and store them back as floats.
# NOTE(review): the pipeline is fitted on `dataset`, which holds train and test rows
# together, so test rows contribute to the scaling statistics — confirm this is intended.
dataset[num_features] = pipeline_preprocessing.fit_transform(dataset[num_features]).astype(float)

## Features **cat** -> **num**

In [None]:
from sklearn.preprocessing import OrdinalEncoder

In [None]:
# Map every categorical column to integer codes.
# Missing values get the dedicated code -1: with the default
# encoded_missing_value=np.nan the int cast below would turn NaN into an
# undefined integer, and the warning about it is silenced at the top of the notebook.
encoder = OrdinalEncoder(encoded_missing_value=-1)
if cat_features:  # nothing to encode when no categorical columns were found
    dataset[cat_features] = encoder.fit_transform(dataset[cat_features]).astype(int)

In [None]:
num_features += cat_features

## Train Test Split

In [None]:
# Cut the processed frame back into its original parts by row position:
# the first `train_size` rows are train, the remainder is test.
train = dataset.iloc[:train_size]
test = dataset.iloc[train_size:]

## **Models**

In [None]:
# Encoder
from sklearn.preprocessing import LabelEncoder

# Models regressor
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

# Model classifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [None]:
class Ensemble:
    """Weighted blend of CatBoost, LightGBM, XGBoost and RandomForest classifiers.

    Weights are given positionally in the order
    ``(catboost, lgbm, xgb, randomforest)``. A model whose weight is falsy
    (``0``) is neither fitted nor queried.

    Args:
        *coef: Exactly four blending weights, one per model.
        lgbm_params: Optional keyword arguments for ``LGBMClassifier``.
        xgb_params: Optional keyword arguments for ``XGBClassifier``.
        randomforest_params: Optional keyword arguments for
            ``RandomForestClassifier``.

    Raises:
        ValueError: If the number of weights differs from the number of models.
    """

    def __init__(self, *coef, lgbm_params=None, xgb_params=None,
                 randomforest_params=None):
        self.labelencoding = LabelEncoder()

        self.catboost = CatBoostClassifier()

        # `Model(**None)` raises TypeError, so missing params fall back to {}.
        self.lgbm_params = dict(lgbm_params or {})
        self.lgbm = LGBMClassifier(**self.lgbm_params)

        self.xgb_params = dict(xgb_params or {})
        self.xgb = XGBClassifier(**self.xgb_params)

        self.randomforest_params = dict(randomforest_params or {})
        self.randomforest = RandomForestClassifier(**self.randomforest_params)

        self.models = [self.catboost, self.lgbm, self.xgb, self.randomforest]
        self.coef = coef
        # An explicit raise survives `python -O`, unlike an assert.
        if len(coef) != len(self.models):
            raise ValueError(
                f'expected {len(self.models)} coefficients, got {len(coef)}'
            )

    def fit(self, X, y):
        """Label-encode ``y`` and fit every model with a non-zero weight.

        Returns:
            The ensemble itself, so calls can be chained.
        """
        y = self.labelencoding.fit_transform(y)
        for weight, model in zip(self.coef, self.models):
            if weight:
                model.fit(X, y)
        return self

    def predict_proba(self, X):
        """Return the weighted sum of the active models' class probabilities.

        The sum is not renormalized, so rows add up to the sum of the active
        weights. If every weight is zero the integer ``0`` is returned.
        """
        proba = 0
        for weight, model in zip(self.coef, self.models):
            if weight:
                proba = proba + model.predict_proba(X) * weight

        return proba

    def predict(self, X):
        """Return the most probable class per row, in the original label space.

        Requires at least one non-zero weight.
        """
        blended = self.predict_proba(X)
        # Probability columns follow the label-encoded class order used in fit().
        return self.labelencoding.inverse_transform(blended.argmax(axis=1))

    def get_params(self):
        """Return a text summary: the weights, then the params of each active model."""
        lines = [f'{self.coef}']

        if self.coef[0]:
            lines.append(f' - CatBoost: {self.catboost.get_params()}')

        configured = (
            ('LGBM', self.lgbm_params),
            ('XGB', self.xgb_params),
            ('RandomForest', self.randomforest_params),
        )
        for weight, (label, params) in zip(self.coef[1:], configured):
            if weight:
                lines.append(f' - {label}: {params}')

        return '\n'.join(lines)


### Feature selection

In [None]:
# Model used to rank the features — TODO: replace None with an estimator.
# NOTE(review): the later get_feature_importance(prettified=True) call suggests a
# CatBoost model is expected here — confirm.
selection_model = None

# Fit on every candidate feature column of the train part
selection_model.fit(train[filter_features], target)

In [None]:
importances = selection_model.get_feature_importance(prettified=True)

In [None]:
importances

In [None]:
importances[importances['Importances'] == 0]

In [None]:
# How many leading rows of `importances` to keep; None keeps all of them.
# NOTE(review): assumes `importances` is already ordered by importance — verify.
count_features = None
# Convert to a plain list: `in` on a pandas Series checks the index labels
# instead of the values, so the categorical filter below never matched on
# feature names.
selected_filter_features = importances['Feature Id'][:count_features].tolist()
selected_cat_features = [feature for feature in cat_features if feature in selected_filter_features]

### Task

In [None]:
X = train[selected_filter_features]
y = target

### *Validation*

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import root_mean_squared_error

In [None]:
scores = []
clfs = []
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for i, (train_index, test_index) in enumerate(kf.split(X)):

    # Positional split of features and target for this fold
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # TODO: create a fresh model for every fold. It must accept
    # fit(..., eval_set=...) and provide predict().
    clf = None

    clf.fit(X_train, y_train, eval_set=(X_test, y_test))

    y_pred = clf.predict(X_test)

    score = root_mean_squared_error(y_test, y_pred)
    print(f"Fold {i + 1} score: {score}\n")

    # Keep the score and the fitted model; the models are reused for the submission
    scores.append(score)
    clfs.append(clf)

# Mean and spread of the score across all folds
print(f"Score: {np.mean(scores).round(4)}\n")
print(f"Std: {np.std(scores).round(4)}\n")

## **Submission**

In [None]:
# Predict the test rows with every fold model, then average the per-model
# predictions element-wise (axis 0 runs over the models).
predict = np.mean(
    [clf.predict(test[selected_filter_features]) for clf in clfs],
    axis=0,
)

### **Save** (обязательно прописать файл описания)

In [None]:
import os
import re

sub = pd.DataFrame(predict)

description_path = "../subs/description.txt"
# Create ../subs on first use so neither write below fails on a missing directory.
os.makedirs(os.path.dirname(description_path), exist_ok=True)

# Read the log left by earlier runs (stays empty when the file does not exist yet).
existing = ""
if os.path.exists(description_path):
    with open(description_path, encoding='utf-8') as file:
        existing = file.read()

# Continue numbering from the last "ID: x.y" line wherever it sits in the file,
# so trailing blank lines can neither reset the counter to 1.0 (which would
# overwrite an older submission) nor break the parsing.
previous_ids = re.findall(r"^ID: (\d+(?:\.\d+)?)[ \t]*$", existing, flags=re.MULTILINE)
sub_id = round(float(previous_ids[-1]) + 0.1, 1) if previous_ids else 1.0

with open(description_path, "a", encoding='utf-8') as file:
    # Separate entries by exactly one blank line, whatever the file ends with.
    if existing.strip():
        trailing_newlines = len(existing) - len(existing.rstrip("\n"))
        file.write("\n" * max(0, 2 - trailing_newlines))

    file.write(f"ID: {sub_id}\n")
    # The score is left blank, to be filled in by hand after submitting.
    file.write("LeaderBord Score: ")

sub.to_csv(f"../subs/submission_{sub_id}.tsv", sep="\t")