## Sales Prediction using Machine Learning in Python

### Loading the Data

In [None]:
import pandas as pd

# Location of the advertising dataset; change it here if the file lives elsewhere.
ADVERTISING_CSV = '/mnt/data/advertising.csv'

# Read the CSV into a DataFrame. Later cells select the columns
# 'TV', 'Radio', 'Newspaper' and 'Sales' from it.
data = pd.read_csv(ADVERTISING_CSV)

# Preview the first rows as the cell output.
data.head()

### Exploratory Data Analysis (EDA)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Per-column null counts and descriptive statistics; both are displayed
# together as the cell's final expression.
missing_values = data.isnull().sum()
summary = data.describe()

# One scatter panel per feature column, each plotted against Sales.
panels = [
    ('TV', 'TV vs Sales'),
    ('Radio', 'Radio vs Sales'),
    ('Newspaper', 'Newspaper vs Sales'),
]
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 6))
for ax, (column, title) in zip(axes, panels):
    sns.scatterplot(x=column, y='Sales', data=data, ax=ax)
    ax.set_title(title)

plt.tight_layout()
plt.show()

summary, missing_values

### Building a Sales Prediction Model

In [None]:
from sklearn.model_selection import train_test_split

# Predictors and target used by every model in this notebook.
feature_columns = ['TV', 'Radio', 'Newspaper']
X = data[feature_columns]
y = data['Sales']

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical across re-runs.
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

# Show the resulting (rows, columns) of each feature frame.
X_train.shape, X_test.shape

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Fit the baseline linear regression on the training split
# (fit() returns the estimator itself, so the call can be chained).
lm = LinearRegression().fit(X_train, y_train)

# Predict the held-out test split and score the predictions.
y_pred = lm.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5  # square root of the MSE

mse, rmse, r2

### Interpretation of Coefficients

In [None]:
# Fitted parameters of the baseline model `lm`: the per-feature slopes
# (coef_) and the constant term (intercept_).
coefficients, intercept = lm.coef_, lm.intercept_

coefficients, intercept

### Residual Analysis

In [None]:

import seaborn as sns

# Residuals of the baseline model on the test split (observed - predicted).
# This name was never defined earlier in the notebook, so the plot call
# below raised NameError on a fresh kernel.
residuals = y_test - y_pred

# Plot the residuals against the fitted values with a LOWESS trend line.
# regplot is used instead of residplot because residplot fits its own
# regression of y on x and plots *that* model's residuals; we already have
# the residuals and only want scatter + smoother. x/y are passed by keyword,
# which current seaborn versions require.
sns.regplot(x=y_pred, y=residuals, lowess=True, line_kws={'color': 'red', 'lw': 1})

# Zero reference line: points above it are under-predicted, below over-predicted.
plt.axhline(0, color='grey', lw=1, linestyle=':')
plt.xlabel('Fitted values')
plt.ylabel('Residuals')
plt.title('Residuals vs Fitted values')
plt.show()


### Feature Engineering

In [None]:

def add_interaction_terms(features):
    """Return a new frame with the pairwise interaction columns appended.

    Parameters
    ----------
    features : pandas.DataFrame
        Must contain the columns 'TV', 'Radio' and 'Newspaper'.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the input columns plus 'TV_Radio',
        'TV_Newspaper' and 'Radio_Newspaper' (element-wise products).
        The input frame is not modified.
    """
    return features.assign(
        TV_Radio=features['TV'] * features['Radio'],
        TV_Newspaper=features['TV'] * features['Newspaper'],
        Radio_Newspaper=features['Radio'] * features['Newspaper'],
    )


# Adding interaction terms.
# The frames returned by train_test_split are slices of `data`; assigning
# columns onto them in place can trigger pandas' SettingWithCopyWarning.
# Building new frames with assign() and rebinding the names avoids that.
# The names X_train / X_test are kept on purpose: the cross-validation and
# correlation cells further down read X_train with these extra columns.
X_train = add_interaction_terms(X_train)
X_test = add_interaction_terms(X_test)

# Retrain the model on the extended feature set
lm_interaction = LinearRegression()
lm_interaction.fit(X_train, y_train)
y_pred_interaction = lm_interaction.predict(X_test)

# Evaluation on the held-out test split
mse_interaction = mean_squared_error(y_test, y_pred_interaction)
rmse_interaction = mse_interaction**0.5
r2_interaction = r2_score(y_test, y_pred_interaction)

mse_interaction, rmse_interaction, r2_interaction


### Cross-validation

In [None]:

from sklearn.model_selection import cross_val_score

# 5-fold cross-validated R^2 of the interaction model on the training split;
# the cell shows the mean and standard deviation across folds.
scores = cross_val_score(
    estimator=lm_interaction,
    X=X_train,
    y=y_train,
    scoring='r2',
    cv=5,
)
scores.mean(), scores.std()


### Correlation Analysis

In [None]:

# Pairwise Pearson correlation (the DataFrame.corr default) between the
# columns of the training feature frame.
correlation_matrix = X_train.corr(method='pearson')
correlation_matrix


### Accuracy Check: Mean Absolute Error (MAE)

In [None]:

from sklearn.metrics import mean_absolute_error

# Mean Absolute Error of the interaction model's predictions on the test split.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred_interaction)
mae
