In [1]:
import pandas as pd

# Advertising spend per channel and the resulting sales, in two batches
# (original observations first, then the additional ones).
data = {
    'tv': [13000.0, 41000.0, 23000.0, 54000.0, 12000.0],
    'radio': [9237.76, 15886.45, 10500.34, 12000.56, 8500.23],
    'social_media': [2409.57, 2913.41, 3100.89, 4300.45, 2000.56],
    'sales': [46677.90, 150177.83, 89000.50, 210000.75, 35000.40]
}
new_data = {
    'tv': [35000.0, 15000.0, 30000.0, 50000.0, 18000.0],
    'radio': [11000.34, 7200.87, 13300.56, 9500.34, 8800.65],
    'social_media': [3200.56, 2700.45, 4100.67, 3700.78, 2200.45],
    'sales': [120000.50, 45000.75, 134000.80, 165000.90, 57000.45]
}

# Append every new_data column onto the matching list in data (in place),
# so data ends up holding all ten observations per column.
for column, extra_values in new_data.items():
    data[column] += extra_values

sales_df = pd.DataFrame(data)

# Feature matrix X: every column except the target. Target vector y: sales.
X = sales_df.drop(columns=["sales"]).to_numpy()
y = sales_df["sales"].to_numpy()

print(X.shape, y.shape)

# X comes out 2-D (rows x feature columns) and y 1-D, so no reshape is
# needed before handing them to an estimator.

(10, 3) (10,)


In [2]:
#Fit a linear regression on standardized features and predict the held-out rows
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 30% of the rows; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits so the test rows do not influence it.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# fit() returns the estimator itself, so construction and fitting can be chained.
reg = LinearRegression().fit(X_train_scaled, y_train)
y_pred = reg.predict(X_test_scaled)

# Show the first two predictions next to their true values as a quick sanity check.
first_predictions, first_actuals = y_pred[:2], y_test[:2]
print("Predictions :{}, Actual Values:{}".format(first_predictions, first_actuals))


Predictions :[180018.88017437 171859.6305146 ], Actual Values:[165000.9  150177.83]


In [4]:
#computing model performance

from sklearn.metrics import mean_squared_error, r2_score

# R^2 of the fitted model on the held-out (scaled) test rows.
r_squared=reg.score(X_test_scaled, y_test)

# equivalent: r2_score(y_test, y_pred) -- the arguments are (y_true, y_pred),
# not the feature matrix.

# RMSE is the square root of the MSE. The root is taken explicitly because the
# `squared=False` keyword was deprecated in scikit-learn 1.4 and removed in 1.6;
# this form works on every version.
rmse=mean_squared_error(y_test, y_pred) ** 0.5

print("R^2: {}".format(r_squared))
print("RMSE: {}".format(rmse))

R^2: 0.18575115703404943
RMSE: 16896.106573466062


In [6]:
#Cross-validation for a more reliable estimate of out-of-sample performance
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

kf= KFold(n_splits=5, shuffle=True, random_state=5)

# Cross-validate the same scale-then-regress workflow used for the train/test
# evaluation. Wrapping both steps in a pipeline makes cross_val_score refit the
# scaler on each training fold, and the cell only needs X and y (not an
# estimator fitted in another cell).
cv_model=make_pipeline(StandardScaler(), LinearRegression())
cv_result=cross_val_score(cv_model, X,y , cv=kf)
print(cv_result)

# NOTE: the last value printed below is the empirical 2.5%-97.5% quantile range
# of the five fold scores, not a statistical confidence interval. With 10 rows
# and 5 splits each fold is scored on only 2 rows, so individual R^2 values can
# swing widely.
print("Mean : {}, Standard deviation : {}, 95% confident interval {}".format(np.mean(cv_result), np.std(cv_result), np.quantile(cv_result, [0.025, 0.975])))

[ 0.97520242  0.98746319 -1.30361043  0.97468952  0.9769909 ]
Mean : 0.5221471192252671, Standard deviation : 0.9128906051930629, 95% confident interval [-1.07578044  0.98641596]
