# In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Load the data: one DataFrame per CSV file in the Kaggle input directory,
# read in the order the paths are listed below.
(
    df_stores,
    df_transactions,
    df_oil,
    df_holidays,
    df_test,
    df_train,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/store-sales-time-series-forecasting/stores.csv",
        "/kaggle/input/store-sales-time-series-forecasting/transactions.csv",
        "/kaggle/input/store-sales-time-series-forecasting/oil.csv",
        "/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv",
        "/kaggle/input/store-sales-time-series-forecasting/test.csv",
        "/kaggle/input/store-sales-time-series-forecasting/train.csv",
    )
)

# Total sales per product family, plus the three families with the largest totals
sales_by_family = df_train.groupby('family')['sales'].agg('sum')
top_3_sales = sales_by_family.sort_values(ascending=False).iloc[:3]

# One bar per family: red for the top three, blue for all the others
is_top_family = sales_by_family.index.isin(top_3_sales.index)
colors = np.where(is_top_family, 'red', 'blue').tolist()
ax = sales_by_family.plot.bar(figsize=(10, 6), color=colors)
ax.set_title('Total Sales by Family (Top 3 in Red)')
ax.set_xlabel('Family')
ax.set_ylabel('Sales')
plt.show()

# Data cleaning for the oil dataset.
# Print the per-column missing-value counts: a bare expression is only displayed
# when it is the last statement of a notebook cell, so it was silently discarded.
print('Missing values in oil data:')
print(df_oil.isnull().sum())

# Dropping incomplete oil rows and then dropping unmatched training rows removes
# every training date that has no oil value (presumably non-trading days such as
# weekends -- confirm against the data). Instead, carry the most recent known
# value forward (and back-fill the very first dates) so every training date
# keeps its rows.
train_dates = pd.to_datetime(df_train['date'])
oil_daily = (
    df_oil.assign(date=pd.to_datetime(df_oil['date']))
    .set_index('date')
    .sort_index()
)
calendar = oil_daily.index.union(pd.DatetimeIndex(train_dates).unique())
# NOTE: 'date' in new_oil is a datetime column (it was a string column before).
new_oil = (
    oil_daily.reindex(calendar)
    .ffill()
    .bfill()
    .rename_axis('date')
    .reset_index()
)

# Merge with training data by date (both sides now use datetime keys)
merge_df = pd.merge(
    df_train.assign(date=train_dates), new_oil, how='left', on=['date']
)
# After gap filling this only removes rows with missing values in other columns
# (or every row, if the oil table holds no usable value at all).
merge_df.dropna(inplace=True)
# Same integer representation of the date that the test preparation uses
merge_df['date'] = pd.to_numeric(pd.to_datetime(merge_df['date']))

# Encode the categorical variable 'family' to numerical values.
# `le` is kept so the identical mapping can be applied to the test data.
le = LabelEncoder()
merge_df['family'] = le.fit_transform(merge_df['family'])

# Feature matrix: every column of merge_df except the row id, the store number
# and the target; the target vector is the 'sales' column.
X = merge_df.drop(columns=['id', 'store_nbr', 'sales']).to_numpy()
y = merge_df['sales'].to_numpy()

# Hold out 40% of the rows for evaluation (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

# Fit a linear regression model on the training portion
regression = LinearRegression().fit(X_train, y_train)

# Report the coefficient of determination (R²) on the held-out portion
r_squared = regression.score(X_test, y_test)
print('Coefficient of Determination (R²):', r_squared)

# Prepare the test data for making predictions.
# The model was fitted on merge_df without 'id', 'store_nbr' and 'sales', so the
# test matrix must hold exactly those feature columns in the same order.
# Previously it held 'id' and lacked the oil column(s): the column count
# happened to match, so predict() ran without error on misaligned features.
feature_columns = merge_df.columns.drop(['id', 'store_nbr', 'sales']).tolist()

test_dates = pd.to_datetime(df_test['date'])

# Oil value(s) for every test row. Dates without an oil value take the most
# recent earlier one (or the next later one at the very start), because the
# model cannot predict from missing values.
oil_daily = (
    df_oil.assign(date=pd.to_datetime(df_oil['date']))
    .set_index('date')
    .sort_index()
)
calendar = oil_daily.index.union(pd.DatetimeIndex(test_dates).unique())
oil_for_test = (
    oil_daily.reindex(calendar)
    .ffill()
    .bfill()
    .reindex(pd.DatetimeIndex(test_dates))
    .set_axis(df_test.index, axis=0)  # align row-for-row with df_test
)

# Same in-place conversions as before, so df_test keeps its numeric date and
# encoded family for any later use.
df_test['date'] = pd.to_numeric(test_dates)
df_test['family'] = le.transform(df_test['family'])

# Row order is unchanged, so predictions still line up with df_test['id'].
test = pd.concat([df_test, oil_for_test], axis=1)[feature_columns]

# Make predictions and count negative predictions
predictions = regression.predict(test.values)
# Count strictly negative values only (zero is a valid sales figure), using a
# vectorized count instead of iterating the array with the builtin sum().
num_negative_predictions = int(np.count_nonzero(predictions < 0))
print('Total number of prediction rows:', len(predictions))
print('Number of negative predictions:', num_negative_predictions)

# Sales cannot be negative, so raise negative predictions to zero
predictions[predictions < 0] = 0

# Pair each test row id with its predicted sales and write the result,
# without the DataFrame index, to submission.csv
submission = dict(id=df_test['id'], sales=predictions)
df_submission = pd.DataFrame(data=submission)
df_submission.to_csv('submission.csv', index=False)
