In [1]:
# Import necessary libraries
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
import os

In [3]:
train_data = pd.read_csv('dataset/train.csv')

In [4]:
X = train_data.drop(columns=['target', 'row_id'])  # Remove non-feature columns from the training data
y = train_data['target'].fillna(train_data['target'].median())  # Fill missing target values with the median

In [7]:
imputer = SimpleImputer(strategy='mean')  # Use mean strategy for imputing missing values
X_imputed = imputer.fit_transform(X)  # Fit and transform the training features

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42)

In [9]:
model = RandomForestRegressor(n_estimators=5, random_state=42)  # Initialize RandomForestRegressor with 5 trees
model.fit(X_train, y_train)  # Train the model on the training data

In [10]:
y_pred = model.predict(X_test)  # Predict target values for the test set

In [11]:
mae = mean_absolute_error(y_test, y_pred)  # Compute MAE between true and predicted values
print(f'Mean Absolute Error: {mae}')  # Print the MAE value

Mean Absolute Error: 6.634945269873497


In [12]:
submission_data = pd.read_csv('dataset/test.csv')

In [13]:
submission_data = submission_data.drop(columns=['row_id'])  # Remove non-feature column 'row_id'

In [14]:
submission_data = submission_data[X.columns]  # Align test features with training features

In [15]:
submission_imputed = imputer.transform(submission_data)  # Transform the test data using the same imputer

In [16]:
submission_preds = model.predict(submission_imputed)  # Predict target values for the test set

In [18]:
# Add 'row_id' column to the submission DataFrame
submission = pd.DataFrame({
    'time_id': pd.read_csv('dataset/test.csv')['time_id'],  # Retrieve 'time_id' from test data
    'row_id': pd.read_csv('dataset/test.csv')['row_id'],  # Retrieve 'row_id' from test data
    'target': submission_preds  # Add predictions to the DataFrame
})

# Create a directory for submission if it does not exist
submission_dir = 'submissions/'
os.makedirs(submission_dir, exist_ok=True)

# Save the submission file to the specified path
submission_path = os.path.join(submission_dir, 'submission.csv')
submission.to_csv(submission_path, index=False)  # Write the submission DataFrame to a CSV file
print(f'Submission saved to {submission_path}')  # Print the path where the submission file is saved

Submission saved to submissions/submission.csv
