In [1]:
from data_preprocess import data_preprocess, get_training_data, get_input_data, prepare_submission
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

# Load data

In [2]:
data = data_preprocess()
X_train, y_train = get_training_data(data)
X_train = X_train.drop(columns=['time', 'date_calc'])

# Pipeline

In [4]:
numeric_features = X_train.select_dtypes(include=['float32']).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()

# Define preprocessor
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])


# Linear regression

In [9]:
from sklearn.linear_model import LinearRegression

# Define model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LinearRegression())])

# Fitting the model to your data
model.fit(X_train, y_train)

# Making predictions
predictions = model.predict(X_train)

scores = cross_val_score(model, X_train, y_train, cv=5)
mse = mean_squared_error(y_train, predictions)

print("MSE on training data:", mse)

MSE on training data: 246093.64294917576


# Random forest

In [7]:
from sklearn.ensemble import RandomForestRegressor

# Define model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestRegressor())])

# Fitting the model to your data
model.fit(X_train, y_train)

# Making predictions
predictions = model.predict(X_train)

scores = cross_val_score(model, X_train, y_train, cv=5)
mse = mean_squared_error(y_train, predictions)
print("MSE on training data:", mse)

MSE on training data: 9604.300430339845


In [None]:
X_test = get_input_data()
predictions = model.predict(X_test.values)

submission = prepare_submission(X_test, predictions)
submission.to_csv('submissions/mlpgr_regressor.csv', index=False)