In [None]:
#IBRAHIM SULTAN
#OASIS INFOBYTE
#TASK 5 SALES PREDICTION USING PYTHON

In [None]:
## Importing Necessary Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as sm

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, which can hide deprecation or numerical warnings raised by the
# libraries used below — consider narrowing the filter.
warnings.filterwarnings('ignore')
# Apply seaborn's 'whitegrid' style to subsequent plots.
sns.set_style('whitegrid')

In [None]:
## Listing the files available under the Kaggle input directory

import os

# Lazily build the full path of every file found beneath /kaggle/input,
# then print them one per line.
input_paths = (
    os.path.join(folder, fname)
    for folder, _subdirs, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for input_path in input_paths:
    print(input_path)
        

In [None]:
# Load the advertising CSV from the Kaggle input directory into `df`.
df = pd.read_csv('/kaggle/input/advertisingcsv/Advertising.csv')

### Exploratory Data Analysis

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Show the column labels exactly as they were read from the CSV.
df.columns

In [None]:
## Give the 'Unnamed: 0' column a descriptive name
## (presumably the row index saved with the CSV — verify against the file)
column_renames = {'Unnamed: 0': 'Index'}
df.rename(columns=column_renames, inplace=True)

In [None]:
# Preview the frame again to check the column labels after the rename.
df.head()

In [None]:
# Print a summary of the dataset: column dtypes and non-null counts per column

df.info()

In [None]:
## Statistical summary of the numerical columns, transposed so that each
## column of df becomes one row of the table

df.describe().transpose()

In [None]:
# Count the missing values in each column of the dataset

df.isna().sum()

### Finding Outliers

In [None]:
# Horizontal box plots of every column in df, to spot outliers visually.
sns.boxplot(data=df,orient='h')

In [None]:
# Horizontal box plot of the Newspaper column on its own.
sns.boxplot(data=df['Newspaper'],orient='h')

### Removing Outliers Using IQR

In [None]:
# Per-column first and third quartiles, and the interquartile range between them.
Q1, Q3 = df.quantile(0.25), df.quantile(0.75)
IQR = Q3 - Q1

In [None]:
# Per-column fences placed 1.5 * IQR below Q1 and above Q3; values outside
# them are treated as outliers.
whisker_span = 1.5 * IQR
lower_bound = Q1 - whisker_span
upper_bound = Q3 + whisker_span

In [None]:
# Keep each value that lies within its own column's bounds (or was already
# missing); every other value is replaced by NaN. No rows are dropped here,
# so the frame keeps its original shape.
within_bounds = df.ge(lower_bound) & df.le(upper_bound)
df1 = df.where(within_bounds | df.isnull())

In [None]:
# Preview the first rows of df1.
df1.head()

In [None]:
## Count the missing values in each column of df1

df1.isna().sum()

In [None]:
## Fill the missing Newspaper values with the column's mean

newspaper_mean = df1['Newspaper'].mean()
df1['Newspaper'] = df1['Newspaper'].fillna(newspaper_mean)

In [None]:
# Re-count the missing values per column of df1.
df1.isna().sum()

In [None]:
## Remove the unnecessary 'Index' column

# Reassign instead of dropping in place, and ignore a missing label, so that
# re-running this cell (or loading a CSV that has no such column) does not
# raise KeyError once 'Index' is already gone.
df1 = df1.drop(columns='Index', errors='ignore')

In [None]:
# Summary statistics of df1, transposed so each column becomes one row.
df1.describe().transpose()

In [None]:
# Horizontal box plots of every column in df1.
sns.boxplot(data=df1,orient='h')

In [None]:
## Scatter plots with fitted regression lines, to check the linearity assumption
## between each independent variable and the dependent variable (Sales)

feature_columns = ["TV", "Radio", "Newspaper"]
sns.pairplot(
    data=df1,
    x_vars=feature_columns,
    y_vars=["Sales"],
    kind="reg",
)

In [None]:
## Histograms to check the normality assumption of the dependent variable (Sales)
# NOTE: DataFrame.hist is called on the whole frame, so a 10-bin histogram is
# drawn for each column of df1, not only for Sales.

df1.hist(bins=10)

In [None]:
## Distribution of Sales, with a kernel density estimate overlaid

# kde must be the boolean True; the string 'True' only worked because any
# non-empty string is truthy (so 'False' would have enabled the KDE as well).
sns.histplot(df1['Sales'], kde=True)

In [None]:
# Regression plots of Sales, Newspaper and Radio against TV, one figure each.
sns.lmplot(data=df1, x='TV', y='Sales')
sns.lmplot(data=df1, x='TV', y='Newspaper')
sns.lmplot(data=df1, x='TV', y='Radio')

In [None]:
## Correlation between the independent variables and the dependent variable

corrmat = df1.corr()
f, ax = plt.subplots(figsize=(12,9))
# Correlation coefficients span [-1, 1]. A 0..1 colour scale would render every
# negative correlation the same as zero, so use a symmetric diverging scale
# centred on 0 and annotate each cell with its value.
sns.heatmap(corrmat, vmin=-1, vmax=1, center=0, cmap='coolwarm',
            annot=True, fmt='.2f', square=True, ax=ax)
ax.set_title('Correlation matrix')
plt.show()

In [None]:
## Model Preparation

# Target is kept as a one-column DataFrame; features are every other column.
Y = df1[["Sales"]]
X = df1.drop(columns='Sales')

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.20, random_state=46
)

In [None]:
## Linear regression model (statsmodels OLS via the formula interface)

# Note: this fit uses every row of df1, not only the training split.
sales_formula = "Sales ~ TV + Radio + Newspaper"
linear_model = sm.ols(formula=sales_formula, data=df1).fit()

In [None]:
## Print the coefficients of the linear model

coefficients = linear_model.params
print(coefficients, "\n")

In [None]:
## Print the full summary table of the fitted linear regression model

model_summary = linear_model.summary()
print(model_summary)

In [None]:
# Accumulators for the evaluation loop: one score and one model name per model.
results, names = [], []

In [None]:
## Models to evaluate, as (display name, estimator) pairs

models = [
    ('LinearRegression', LinearRegression()),
]

In [None]:
## Fit each model on the training split and report its RMSE on the test split

# Start from empty accumulators so that re-running this cell does not append
# duplicate entries to results/names.
results = []
names = []

for name, model in models:
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)
    # RMSE: square root of the mean squared error on the held-out rows.
    result = np.sqrt(mean_squared_error(Y_test, Y_pred))
    results.append(result)
    names.append(name)
    msg = f"{name}: {result:f}"
    print(msg)

### Make Predictions on New Data

In [None]:
# Predict Sales for one new observation of advertising spend, using the
# fitted statsmodels model. The result is printed as a pandas Series.
new_data = pd.DataFrame([{'TV': 230.1, 'Radio': 37.8, 'Newspaper': 69.2}])
predicted_sales = linear_model.predict(new_data)
print("Predicted Sales:", predicted_sales)

In [None]:
# Predict Sales for a second new observation of advertising spend.
# NOTE(review): these values may lie outside the range seen in df1, in which
# case the prediction is an extrapolation — compare with df1.describe().
new_data = pd.DataFrame([{'TV': 500, 'Radio': 250, 'Newspaper': 125}])
predicted_sales = linear_model.predict(new_data)
print("Predicted Sales:", predicted_sales)