In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/store-sales-time-series-forecasting/oil.csv
/kaggle/input/store-sales-time-series-forecasting/sample_submission.csv
/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv
/kaggle/input/store-sales-time-series-forecasting/stores.csv
/kaggle/input/store-sales-time-series-forecasting/train.csv
/kaggle/input/store-sales-time-series-forecasting/test.csv
/kaggle/input/store-sales-time-series-forecasting/transactions.csv


In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder  # Import LabelEncoder
import warnings

# Ignore specific warning messages
warnings.filterwarnings("ignore", category=DeprecationWarning)



In [3]:
# Load the data
data_dir = '/kaggle/input/store-sales-time-series-forecasting'
train_df = pd.read_csv(f"{data_dir}/train.csv")
test_df = pd.read_csv(f"{data_dir}/test.csv")
oil_df = pd.read_csv(f"{data_dir}/oil.csv")
holidays_events_df = pd.read_csv(f"{data_dir}/holidays_events.csv")
stores_df = pd.read_csv(f"{data_dir}/stores.csv")
transactions_df  = pd.read_csv(f"{data_dir}/transactions.csv")

In [4]:
display(train_df.head(5))
display(test_df.head(5))
display(oil_df.head(5))
display(holidays_events_df.head(5))
display(stores_df.head(5))
display(transactions_df.head(5))

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0
1,1,2013-01-01,1,BABY CARE,0.0,0
2,2,2013-01-01,1,BEAUTY,0.0,0
3,3,2013-01-01,1,BEVERAGES,0.0,0
4,4,2013-01-01,1,BOOKS,0.0,0


Unnamed: 0,id,date,store_nbr,family,onpromotion
0,3000888,2017-08-16,1,AUTOMOTIVE,0
1,3000889,2017-08-16,1,BABY CARE,0
2,3000890,2017-08-16,1,BEAUTY,2
3,3000891,2017-08-16,1,BEVERAGES,20
4,3000892,2017-08-16,1,BOOKS,0


Unnamed: 0,date,dcoilwtico
0,2013-01-01,
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-07,93.2


Unnamed: 0,date,type,locale,locale_name,description,transferred
0,2012-03-02,Holiday,Local,Manta,Fundacion de Manta,False
1,2012-04-01,Holiday,Regional,Cotopaxi,Provincializacion de Cotopaxi,False
2,2012-04-12,Holiday,Local,Cuenca,Fundacion de Cuenca,False
3,2012-04-14,Holiday,Local,Libertad,Cantonizacion de Libertad,False
4,2012-04-21,Holiday,Local,Riobamba,Cantonizacion de Riobamba,False


Unnamed: 0,store_nbr,city,state,type,cluster
0,1,Quito,Pichincha,D,13
1,2,Quito,Pichincha,D,13
2,3,Quito,Pichincha,D,8
3,4,Quito,Pichincha,D,9
4,5,Santo Domingo,Santo Domingo de los Tsachilas,D,4


Unnamed: 0,date,store_nbr,transactions
0,2013-01-01,25,770
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922


In [5]:
# Prepare the datasets
train_ds = train_df.iloc[:, [2, 3, 5]].fillna(train_df.mode().iloc[0])
test_ds = test_df.iloc[:, [2, 3, 4]].fillna(test_df.mode().iloc[0])

In [6]:
# Encoding categorical data
label_encoder = LabelEncoder()
train_ds = train_ds.apply(lambda col: label_encoder.fit_transform(col) if col.dtype == 'object' else col)
test_ds = test_ds.apply(lambda col: label_encoder.fit_transform(col) if col.dtype == 'object' else col)

In [7]:
# Prepare target variable
target = train_df['sales']
X = train_ds.values
y = target.values

In [8]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Create and train the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model's performance
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared Score: {r2}")

# Make predictions on the test set
final_test_pred = model.predict(test_ds)

# Prepare submission
submission = pd.DataFrame({'id': test_df['id'].values, 'sales': final_test_pred})
submission.to_csv("submission.csv", index=False)


Mean Squared Error: 1003082.1567529488
R-squared Score: 0.1928929867190211




In [10]:
from xgboost import XGBRegressor  # Import XGBRegressor

# Create and train the XGBoost regressor model
model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)  # You can adjust hyperparameters
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model's performance
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared Score: {r2}")

# Make predictions on the test set
final_test_pred = model.predict(test_ds)

# Prepare submission
submission = pd.DataFrame({'id': test_df['id'].values, 'sales': final_test_pred})
submission.to_csv("submission_1.csv", index=False)


Mean Squared Error: 241034.52552589605
R-squared Score: 0.8060571064043783


In [11]:
from sklearn.ensemble import RandomForestRegressor  # Import RandomForestRegressor

# Create and train the Random Forest regressor model
model = RandomForestRegressor(n_estimators=100, random_state=42)  # You can adjust hyperparameters
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model's performance
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared Score: {r2}")

# Make predictions on the test set
final_test_pred = model.predict(test_ds)

# Prepare submission
submission = pd.DataFrame({'id': test_df['id'].values, 'sales': final_test_pred})
submission.to_csv("submission_2.csv", index=False)


Mean Squared Error: 216392.55882021115
R-squared Score: 0.8258846988057583




In [12]:
from sklearn.ensemble import GradientBoostingRegressor  # Import GradientBoostingRegressor

# Create and train the Gradient Boosting regressor model
model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)  # You can adjust hyperparameters
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model's performance
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared Score: {r2}")

# Make predictions on the test set
final_test_pred = model.predict(test_ds)

# Prepare submission
submission = pd.DataFrame({'id': test_df['id'].values, 'sales': final_test_pred})
submission.to_csv("submission_3.csv", index=False)

Mean Squared Error: 417481.198918567
R-squared Score: 0.664083344228886


