In [20]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
# Make sure the output directory for saved figures exists (no-op if it
# is already there).
os.makedirs("plots", exist_ok=True)

# Load the advertising dataset from the working directory.
data = pd.read_csv("Advertising.csv")

# Quick look at the first rows and the summary statistics.
print(data.head())
print(data.describe())

# Pairwise correlation heatmap across all columns of the frame.
sns.heatmap(data.corr(), annot=True, cmap="Blues")
# Save BEFORE showing: once plt.show() has rendered the figure it is
# released, and a later savefig() would write an empty canvas to disk.
plt.savefig("plots/1_correlation_heatmap.png")
plt.show()
plt.close()




In [21]:
# Drop rows that are exact duplicates of an earlier row.
data = data.drop_duplicates()

# Per-column count of missing values.
print("Missing values:\n", data.isnull().sum())

# Report the 1.5 * IQR fences for each numeric column. This only prints
# the range; no rows are filtered out here.
for col in ['TV','Radio','Newspaper','Sales']:
    column_values = data[col]
    lower_quartile = column_values.quantile(0.25)
    upper_quartile = column_values.quantile(0.75)
    spread = upper_quartile - lower_quartile
    lower_fence = lower_quartile - 1.5 * spread
    upper_fence = upper_quartile + 1.5 * spread
    print(f"{col} Outlier Range: <{lower_fence} or >{upper_fence}")


In [22]:
from sklearn.preprocessing import StandardScaler

# Engineered feature: combined spend across the three channels.
data['Total_Ad'] = data['TV'].add(data['Radio']).add(data['Newspaper'])

# Standardise the four predictor columns using statistics computed on
# the full dataset.
# NOTE(review): X_scaled is not referenced by the later cells visible
# here; the models are trained on a scaler fitted on the training split.
scaler = StandardScaler()
predictor_frame = data[['TV','Radio','Newspaper','Total_Ad']]
X_scaled = scaler.fit_transform(predictor_frame)


In [23]:
from functools import partial

from sklearn.feature_selection import SelectKBest, mutual_info_regression

# Predictors (including the engineered Total_Ad column) and the target.
X = data[['TV','Radio','Newspaper','Total_Ad']]
y = data['Sales']

# mutual_info_regression is a randomised estimator; without a fixed
# random_state the scores (and possibly their ranking) change on every
# run. Seed it with the same value used for the train/test split.
seeded_mutual_info = partial(mutual_info_regression, random_state=42)

# k='all' keeps every feature: the selector is used only to score them.
selector = SelectKBest(score_func=seeded_mutual_info, k='all')
selector.fit(X, y)

# Label each score with its column name and list highest first.
feature_scores = pd.Series(selector.scores_, index=X.columns)
print("Feature Scores:\n", feature_scores.sort_values(ascending=False))


In [24]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Hold out 20% of the rows for evaluation; the seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training split only and reuse its statistics on
# the test split, so no test-set information leaks into preprocessing.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

lr = LinearRegression()
# Seed the forest: bootstrap sampling and feature sub-sampling are
# random, so an unseeded model gives a different fit (and a different
# R2) on every run.
rf = RandomForestRegressor(random_state=42)

lr.fit(X_train_scaled, y_train)
rf.fit(X_train_scaled, y_train)


In [25]:
from sklearn.metrics import r2_score, mean_squared_error


# Predict on the held-out, scaled test split with both fitted models.
lr_pred = lr.predict(X_test_scaled)
rf_pred = rf.predict(X_test_scaled)

print("Linear Regression R2:", r2_score(y_test, lr_pred))
print("Random Forest R2:", r2_score(y_test, rf_pred))

# Actual vs. predicted sales for the linear model.
plt.scatter(y_test, lr_pred)
plt.xlabel("Actual Sales")
plt.ylabel("Predicted Sales")
plt.title("Actual vs Predicted Sales")
# Save BEFORE showing: once plt.show() has rendered the figure it is
# released, and a later savefig() would write an empty canvas to disk.
plt.savefig("plots/2_actual_vs_predicted_lr.png")
plt.show()
plt.close()




In [26]:
# Refit a linear model on the unscaled predictors so the coefficients
# are expressed in the original spend units.
lr2 = LinearRegression()
lr2.fit(X, y)

# NOTE(review): Total_Ad is defined as TV + Radio + Newspaper, so the
# four predictors are exactly collinear and the individual coefficients
# are not uniquely determined -- interpret them with care.
coef = pd.Series(lr2.coef_, index=X.columns)
print("Impact of Advertising (Coefficients):\n", coef)

# What-if scenario: raise TV spend by 10% on every row.
X_inc = X.copy()
X_inc['TV'] = X_inc['TV'] * 1.10
# Total_Ad is derived from the channel spends, so it must be recomputed
# after changing TV; leaving it stale would feed the model rows that
# contradict the feature's own definition and distort the estimate.
X_inc['Total_Ad'] = X_inc['TV'] + X_inc['Radio'] + X_inc['Newspaper']
pred_base = lr2.predict(X)
pred_inc = lr2.predict(X_inc)

print("Sales Change if TV +10%:", pred_inc.mean() - pred_base.mean())


In [27]:
# Score each feature as its linear coefficient divided by the feature's
# mean value, then order from highest to lowest.
# NOTE(review): this is a rough proxy rather than a financial ROI, and
# the ranking includes the derived Total_Ad column -- confirm intent.
mean_spend = X.mean()
roi = (coef / mean_spend).sort_values(ascending=False)
print("ROI Ranking:\n", roi)

# Summary lines; only the first one is derived from the ranking above.
top_ranked = roi.index[0]
insight_lines = [
    f"- Best performing channel: {top_ranked}",
    "- Increase budget on highest ROI channel.",
    "- Reduce spending on low-impact channels like Newspaper.",
    "- Use optimized ad mix to maximize sales growth.",
]

print("\nBusiness Insights:")
for insight in insight_lines:
    print(insight)
