In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# Load the data
train_df = pd.read_csv('/home/enat/Downloads/rossmann-store-sales/train.csv')
test_df = pd.read_csv('/home/enat/Downloads/rossmann-store-sales/test.csv')

# Display the first few rows of the train dataset
print("Train Data:")
print(train_df.head())

# Display the first few rows of the test dataset
print("\nTest Data:")
print(test_df.head())


Train Data:
   Store  DayOfWeek        Date  Sales  Customers  Open  Promo StateHoliday  \
0      1          5  2015-07-31   5263        555     1      1            0   
1      2          5  2015-07-31   6064        625     1      1            0   
2      3          5  2015-07-31   8314        821     1      1            0   
3      4          5  2015-07-31  13995       1498     1      1            0   
4      5          5  2015-07-31   4822        559     1      1            0   

   SchoolHoliday  
0              1  
1              1  
2              1  
3              1  
4              1  

Test Data:
   Id  Store  DayOfWeek        Date  Open  Promo StateHoliday  SchoolHoliday
0   1      1          4  2015-09-17   1.0      1            0              0
1   2      3          4  2015-09-17   1.0      1            0              0
2   3      7          4  2015-09-17   1.0      1            0              0
3   4      8          4  2015-09-17   1.0      1            0              0
4 

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
# Convert non-numeric columns to numeric (only for columns that exist)
def convert_to_numeric(df, is_train=True):
    df['Store'] = df['Store'].astype(int)
    if is_train:
        df['Sales'] = df['Sales'].astype(float)
        df['Customers'] = df['Customers'].astype(int)
    df['DayOfWeek'] = df['DayOfWeek'].astype(int)
    df['Open'] = df['Open'].astype(float)
    df['Promo'] = df['Promo'].astype(int)
    df['SchoolHoliday'] = df['SchoolHoliday'].astype(int)
    return df

train_df = convert_to_numeric(train_df, is_train=True)
test_df = convert_to_numeric(test_df, is_train=False)

# Display the processed data
print("\nProcessed Train Data:")
print(train_df.head())
print("\nProcessed Test Data:")
print(test_df.head())




Processed Train Data:
   Store  DayOfWeek        Date    Sales  Customers  Open  Promo StateHoliday  \
0      1          5  2015-07-31   5263.0        555   1.0      1            0   
1      2          5  2015-07-31   6064.0        625   1.0      1            0   
2      3          5  2015-07-31   8314.0        821   1.0      1            0   
3      4          5  2015-07-31  13995.0       1498   1.0      1            0   
4      5          5  2015-07-31   4822.0        559   1.0      1            0   

   SchoolHoliday  
0              1  
1              1  
2              1  
3              1  
4              1  

Processed Test Data:
   Id  Store  DayOfWeek        Date  Open  Promo StateHoliday  SchoolHoliday
0   1      1          4  2015-09-17   1.0      1            0              0
1   2      3          4  2015-09-17   1.0      1            0              0
2   3      7          4  2015-09-17   1.0      1            0              0
3   4      8          4  2015-09-17   1.0     

In [4]:
# Handle NaN values
def handle_missing_values(df):
    df.fillna(df.median(), inplace=True)
    return df

train_df = handle_missing_values(train_df)
test_df = handle_missing_values(test_df)

# Display the processed data
print("\nProcessed Train Data (NaN handled):")
print(train_df.head())
print("\nProcessed Test Data (NaN handled):")
print(test_df.head())



Processed Train Data (NaN handled):
   Store  DayOfWeek        Date    Sales  Customers  Open  Promo StateHoliday  \
0      1          5  2015-07-31   5263.0        555   1.0      1            0   
1      2          5  2015-07-31   6064.0        625   1.0      1            0   
2      3          5  2015-07-31   8314.0        821   1.0      1            0   
3      4          5  2015-07-31  13995.0       1498   1.0      1            0   
4      5          5  2015-07-31   4822.0        559   1.0      1            0   

   SchoolHoliday  
0              1  
1              1  
2              1  
3              1  
4              1  

Processed Test Data (NaN handled):
   Id  Store  DayOfWeek        Date  Open  Promo StateHoliday  SchoolHoliday
0   1      1          4  2015-09-17   1.0      1            0              0
1   2      3          4  2015-09-17   1.0      1            0              0
2   3      7          4  2015-09-17   1.0      1            0              0
3   4      8      

  df.fillna(df.median(), inplace=True)
  df.fillna(df.median(), inplace=True)


In [5]:
# Extract new features from datetime columns
def extract_features(df):
    df['Date'] = pd.to_datetime(df['Date'])
    df['Weekday'] = df['Date'].dt.weekday
    df['Weekend'] = df['Weekday'] >= 5
    df['Day_of_Month'] = df['Date'].dt.day
    df['Month'] = df['Date'].dt.month
    df['Year'] = df['Date'].dt.year
    df['Is_Beginning_Month'] = df['Date'].dt.day <= 10
    df['Is_Mid_Month'] = (df['Date'].dt.day > 10) & (df['Date'].dt.day <= 20)
    df['Is_End_Month'] = df['Date'].dt.day > 20
    return df

train_df = extract_features(train_df)
test_df = extract_features(test_df)

# Display the new features
print("\nTrain Data with New Features:")
print(train_df.head())
print("\nTest Data with New Features:")
print(test_df.head())



Train Data with New Features:
   Store  DayOfWeek       Date    Sales  Customers  Open  Promo StateHoliday  \
0      1          5 2015-07-31   5263.0        555   1.0      1            0   
1      2          5 2015-07-31   6064.0        625   1.0      1            0   
2      3          5 2015-07-31   8314.0        821   1.0      1            0   
3      4          5 2015-07-31  13995.0       1498   1.0      1            0   
4      5          5 2015-07-31   4822.0        559   1.0      1            0   

   SchoolHoliday  Weekday  Weekend  Day_of_Month  Month  Year  \
0              1        4    False            31      7  2015   
1              1        4    False            31      7  2015   
2              1        4    False            31      7  2015   
3              1        4    False            31      7  2015   
4              1        4    False            31      7  2015   

   Is_Beginning_Month  Is_Mid_Month  Is_End_Month  
0               False         False          

In [9]:
# Scale the train data
scaler = StandardScaler()
# Selecting numeric features for scaling in the train data
numeric_features_train = ['DayOfWeek', 'Open', 'Promo', 'SchoolHoliday', 'Weekday', 'Day_of_Month', 'Month', 'Year', 'Sales', 'Customers']

# Fit and transform the train data
train_df[numeric_features_train] = scaler.fit_transform(train_df[numeric_features_train])

# Display scaled train data
print("\nScaled Train Data:")
print(train_df.head())



Scaled Train Data:
   Store  DayOfWeek       Date     Sales  Customers      Open     Promo  \
0      1   0.501484 2015-07-31 -0.132683  -0.168269  0.452399  1.273237   
1      2   0.501484 2015-07-31  0.075373  -0.017540  0.452399  1.273237   
2      3   0.501484 2015-07-31  0.659800   0.404499  0.452399  1.273237   
3      4   0.501484 2015-07-31  2.135414   1.862258  0.452399  1.273237   
4      5   0.501484 2015-07-31 -0.247231  -0.159656  0.452399  1.273237   

  StateHoliday  SchoolHoliday   Weekday  Weekend  Day_of_Month     Month  \
0            0       2.144211  0.501484    False      1.740766  0.346724   
1            0       2.144211  0.501484    False      1.740766  0.346724   
2            0       2.144211  0.501484    False      1.740766  0.346724   
3            0       2.144211  0.501484    False      1.740766  0.346724   
4            0       2.144211  0.501484    False      1.740766  0.346724   

       Year  Is_Beginning_Month  Is_Mid_Month  Is_End_Month  
0  1.50207

In [10]:
# Scale the test data
scaler_test = StandardScaler()
# Selecting numeric features for scaling in the test data
numeric_features_test = ['DayOfWeek', 'Open', 'Promo', 'SchoolHoliday', 'Weekday', 'Day_of_Month', 'Month', 'Year']

# Fit and transform the test data separately
test_df[numeric_features_test] = scaler_test.fit_transform(test_df[numeric_features_test])

# Display scaled test data
print("\nScaled Test Data:")
print(test_df.head())



Scaled Test Data:
   Id  Store  DayOfWeek       Date      Open     Promo StateHoliday  \
0   1      1   0.010337 2015-09-17  0.412874  1.235442            0   
1   2      3   0.010337 2015-09-17  0.412874  1.235442            0   
2   3      7   0.010337 2015-09-17  0.412874  1.235442            0   
3   4      8   0.010337 2015-09-17  0.412874  1.235442            0   
4   5      9   0.010337 2015-09-17  0.412874  1.235442            0   

   SchoolHoliday   Weekday  Weekend  Day_of_Month     Month  Year  \
0      -0.892695  0.010337    False      0.411816  1.350381   0.0   
1      -0.892695  0.010337    False      0.411816  1.350381   0.0   
2      -0.892695  0.010337    False      0.411816  1.350381   0.0   
3      -0.892695  0.010337    False      0.411816  1.350381   0.0   
4      -0.892695  0.010337    False      0.411816  1.350381   0.0   

   Is_Beginning_Month  Is_Mid_Month  Is_End_Month  
0               False          True         False  
1               False          True

In [None]:
# Save the preprocessed train and test data
train_df.to_csv('preprocessed_train.csv', index=False)
test_df.to_csv('preprocessed_test.csv', index=False)

print("Preprocessed data saved.")
