In [29]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

In [30]:
# Load the data
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')


In [31]:
# Convert 'Timestamp' to datetime
train_df['Timestamp'] = pd.to_datetime(train_df['Timestamp'])
test_df['Timestamp'] = pd.to_datetime(test_df['Timestamp'])


In [32]:
# Clean 'VAT' and 'Barrel_Price' columns
train_df['VAT'] = pd.to_numeric(train_df['VAT'], errors='coerce')
train_df['Barrel_Price'] = pd.to_numeric(train_df['Barrel_Price'], errors='coerce')
test_df['VAT'] = pd.to_numeric(test_df['VAT'], errors='coerce')
test_df['Barrel_Price'] = pd.to_numeric(test_df['Barrel_Price'], errors='coerce')


In [33]:
# Handling missing values
train_df['Demand_Fuel'].fillna(train_df['Demand_Fuel'].mean(), inplace=True)
train_df['Excise_Duty'].fillna(train_df['Excise_Duty'].mean(), inplace=True)
train_df['Refine_Cost'].fillna(train_df['Refine_Cost'].mean(), inplace=True)
train_df['Per_Change'].fillna(train_df['Per_Change'].mean(), inplace=True)
train_df['VAT'].fillna(train_df['VAT'].mean(), inplace=True)
train_df['Barrel_Price'].fillna(train_df['Barrel_Price'].mean(), inplace=True)


In [34]:
test_df['Demand_Fuel'].fillna(test_df['Demand_Fuel'].mean(), inplace=True)
test_df['Excise_Duty'].fillna(test_df['Excise_Duty'].mean(), inplace=True)
test_df['Refine_Cost'].fillna(test_df['Refine_Cost'].mean(), inplace=True)
test_df['Per_Change'].fillna(test_df['Per_Change'].mean(), inplace=True)
test_df['VAT'].fillna(test_df['VAT'].mean(), inplace=True)
test_df['Barrel_Price'].fillna(test_df['Barrel_Price'].mean(), inplace=True)

In [35]:
# Feature engineering: Extract year, month, day, hour from 'Timestamp'
train_df['Year'] = train_df['Timestamp'].dt.year
train_df['Month'] = train_df['Timestamp'].dt.month
train_df['Day'] = train_df['Timestamp'].dt.day
train_df['Hour'] = train_df['Timestamp'].dt.hour

test_df['Year'] = test_df['Timestamp'].dt.year
test_df['Month'] = test_df['Timestamp'].dt.month
test_df['Day'] = test_df['Timestamp'].dt.day
test_df['Hour'] = test_df['Timestamp'].dt.hour

In [36]:

# Drop the original 'Timestamp' column
train_df.drop(columns=['Timestamp'], inplace=True)
test_df.drop(columns=['Timestamp'], inplace=True)


In [37]:
# Define features and target
X = train_df.drop(columns=['Price'])
y = train_df['Price']

In [38]:
# Split the data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a RandomForest model
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

In [39]:
# Evaluate the model
y_pred = model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
score = max(0, 100 * (1 - rmse))
print(f'Validation RMSE: {rmse}, Score: {score}')

Validation RMSE: 0.2986573244346621, Score: 70.13426755653379


In [40]:
# Predict on test set
predictions = model.predict(test_df)

In [41]:


# Create submission file
submission = pd.DataFrame({
    'Timestamp':test['Timestamp'],
    'Price': predictions
})

submission.to_csv('submission1.csv', index=False)

In [42]:
test = pd.read_csv('test.csv')

In [43]:
test.columns

Index(['Timestamp', 'Demand_Fuel', 'Supply_Fuel', 'Excise_Duty', 'VAT',
       'Barrel_Price', 'Refine_Cost', 'Per_Change'],
      dtype='object')

xgb regressor

In [53]:
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV

In [45]:
model_xgb = XGBRegressor(
    n_estimators=100,      # Number of trees
    learning_rate=0.1,     # Step size shrinkage
    max_depth=3,           # Maximum depth of a tree
    subsample=0.8,         # Subsample ratio of the training instances
    colsample_bytree=0.8,  # Subsample ratio of columns when constructing each tree
    random_state=42        # Seed for reproducibility
)


In [46]:
model_xgb.fit(X_train, y_train)


In [47]:
# Evaluate the model
y_pred_xgb = model_xgb.predict(X_val)
rmse_xgb = np.sqrt(mean_squared_error(y_val, y_pred_xgb))
score_xgb = max(0, 100 * (1 - rmse_xgb))
print(f'Validation RMSE: {rmse_xgb}, Score: {score_xgb}')


Validation RMSE: 0.2922464206856748, Score: 70.77535793143252


In [48]:
from sklearn.model_selection import GridSearchCV

In [49]:
# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 4, 5],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1]
}

In [57]:
# Initialize GridSearchCV
grid_search = GridSearchCV(XGBRegressor(), param_grid, cv=3, scoring='neg_mean_squared_error', verbose=1)

# Fit the model
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 108 candidates, totalling 324 fits


In [58]:
# Best parameters and score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best Score: {best_score}")

Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}
Best Score: -0.0821430629791805


In [60]:
# Initialize the model with the best parameters
best_model = XGBRegressor(**best_params)

# Fit the model
best_model.fit(X_train, y_train)

# Predict on the test set
y_pred_best = best_model.predict(X_val)




In [61]:
# Evaluate the model
from sklearn.metrics import mean_squared_error, r2_score

mse_best = mean_squared_error(y_val, y_pred)
r2_best = r2_score(y_val, y_pred_best)

print(f"Mean Squared Error: {mse_best}")
print(f"R-squared: {r2_best}")

Mean Squared Error: 0.089196197438471
R-squared: 0.12536145003846966


In [62]:
# Evaluate the model
rmse_best = np.sqrt(mean_squared_error(y_val, y_pred_best))
score_best = max(0, 100 * (1 - rmse_best))
print(f'Validation RMSE: {rmse_best}, Score: {score_best}')


Validation RMSE: 0.29215049300224705, Score: 70.7849506997753
