In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import plotly.express as py
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.offline import init_notebook_mode, iplot
import cufflinks as cf
# Initialise plotly's offline notebook rendering.
# NOTE(review): connected=True presumably loads plotly.js from a CDN, which
# would require network access when the notebook is viewed — confirm.
init_notebook_mode(connected=True)
# Put cufflinks in offline mode; the `.iplot(...)` calls on DataFrame/Series
# objects later in this notebook come from cufflinks.
cf.go_offline()

In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
import statsmodels.formula.api as sm

In [4]:
import warnings
# Ignore every warning for the rest of the session (no category, message or
# module restriction is given).
# NOTE(review): this also hides deprecation/numerical warnings raised by the
# libraries used below; consider narrowing the filter.
warnings.filterwarnings('ignore')

In [5]:
# Read the salary dataset from the working directory and preview the
# first five rows.
df = pd.read_csv(filepath_or_buffer='Salary_Data.csv')
df.head(n=5)

Unnamed: 0,YearsExperience,Salary
0,1.1,39343.0
1,1.3,46205.0
2,1.5,37731.0
3,2.0,43525.0
4,2.2,39891.0


In [6]:
# (rows, columns) of the loaded dataset.
df.shape

(30, 2)

In [7]:
# Split predictor and response by column NAME rather than by position.
# Positional slices (`iloc[:, :1]` / `iloc[:, 1:]`) silently change meaning
# once extra columns are added to `df` (a later cell adds `predicted`), so
# re-running this cell would put two columns into `y`.
# Double brackets keep both results as single-column DataFrames, matching
# the shapes the positional slices produced on the freshly loaded frame.
x = df[['YearsExperience']]
y = df[['Salary']]

In [8]:
# Peek at the first row of the predictor frame.
x.iloc[:1]

Unnamed: 0,YearsExperience
0,1.1


In [9]:
# Peek at the first row of the response frame.
y.iloc[:1]

Unnamed: 0,Salary
0,39343.0


In [10]:
# Scatter of salary against experience with an OLS trendline requested
# from plotly express (`py` is the plotly.express alias in this notebook).
axis_labels = {'Salary': 'Salary', 'YearsExperience': 'Years Experience'}
plot = py.scatter(
    df,
    x='YearsExperience',
    y='Salary',
    trendline='ols',
    labels=axis_labels,
)
plot.show()

In [11]:
# Distribution plot of the years-of-experience column.
hist_data = [df['YearsExperience']]
group_labels = ['Years Experience']
dist = ff.create_distplot(hist_data, group_labels)
dist.show()

In [12]:
# Histogram of the salary column.
salary = df['Salary']
salary.iplot(kind='hist', color='blue')

In [13]:
# Bar chart of salary (y) against years of experience (x).
bar_options = dict(kind='bar', x='YearsExperience', y='Salary')
df.iplot(**bar_options)

In [14]:
# Box plot of the salary column.
salary = df['Salary']
salary.iplot(kind='box', color='blue')

In [15]:
# Registry of goodness-of-fit summaries, one dict per model.
# (The identifier's spelling is kept as-is for backward compatibility.)
r2socreList = []

def getRScore():
    """Return the recorded model summaries in insertion order.

    Returns:
        list[dict]: entries with keys 'model', 'rsquared' and 'mse'.
        The live module-level list is returned, not a copy.
    """
    return r2socreList

def addRScore(modelNum, value, intercept, mse):
    """Record (or refresh) the fit summary for one model.

    If an entry with the same label already exists it is replaced in
    place instead of appending a duplicate, so re-running a model cell
    keeps the registry at one row per model.

    Args:
        modelNum: label identifying the model (e.g. 'model1').
        value: R-squared of the fit; stored rounded to 3 decimals.
        intercept: accepted for call compatibility; currently not stored.
        mse: mean squared error of the fit; stored rounded to 3 decimals.
    """
    obj = {
        'model': modelNum,
        'rsquared': round(value, 3),
        'mse': round(mse, 3)
    }
    # Replace an earlier record for the same label, keeping its position.
    for position, existing in enumerate(r2socreList):
        if existing['model'] == modelNum:
            r2socreList[position] = obj
            return
    r2socreList.append(obj)

### Using OLS Model

In [16]:
# Baseline: simple linear fit of Salary on YearsExperience.
model1 = sm.ols("Salary~YearsExperience", data=df).fit()
# In-sample error: predict() is called without new data, i.e. on the data
# the model was fitted to.
mse = mean_squared_error(y, model1.predict())
# Pass the fitted intercept coefficient (`params`), not the p-value of the
# intercept (`pvalues`), as the `intercept` argument.
addRScore('model1', model1.rsquared, model1.params.Intercept, mse)
# Call model1.summary() for the full regression table.

In [17]:
# Overlay model1's in-sample predictions on the raw salary scatter.
axis_labels = {'Salary': 'Salary', 'YearsExperience': 'Years Experience'}
plot = py.scatter(df, x='YearsExperience', y='Salary', labels=axis_labels)
fit_trace = go.Scatter(
    x=df['YearsExperience'],
    y=model1.predict(),
    name='Regression Fit',
    mode="markers",
)
plot.add_traces(fit_trace)
plot.show()

In [18]:
# Log-transformed predictor: Salary on log(YearsExperience).
model2 = sm.ols("Salary~np.log(YearsExperience)",data=df).fit()
# In-sample error on the data the model was fitted to.
mse = mean_squared_error(y, model2.predict())
# Pass the fitted intercept coefficient (`params`), not the p-value of the
# intercept (`pvalues`), as the `intercept` argument.
addRScore('model2', model2.rsquared, model2.params.Intercept, mse)

In [19]:
# Overlay model2's in-sample predictions on the raw salary scatter.
axis_labels = {'Salary': 'Salary', 'YearsExperience': 'Years Experience'}
plot = py.scatter(df, x='YearsExperience', y='Salary', labels=axis_labels)
fit_trace = go.Scatter(
    x=df['YearsExperience'],
    y=model2.predict(),
    name='Regression Fit',
    mode="markers",
)
plot.add_traces(fit_trace)
plot.show()

In [20]:
# Square-root-transformed predictor: Salary on sqrt(YearsExperience).
model3 = sm.ols("Salary~np.sqrt(YearsExperience)",data=df).fit()
# In-sample error on the data the model was fitted to.
mse = mean_squared_error(y, model3.predict())
# Pass the fitted intercept coefficient (`params`), not the p-value of the
# intercept (`pvalues`), as the `intercept` argument.
addRScore('model3', model3.rsquared, model3.params.Intercept, mse)

In [21]:
# Cube-root-transformed predictor: Salary on cbrt(YearsExperience).
model4 = sm.ols("Salary~np.cbrt(YearsExperience)",data=df).fit()
# In-sample error on the data the model was fitted to.
mse = mean_squared_error(y, model4.predict())
# Pass the fitted intercept coefficient (`params`), not the p-value of the
# intercept (`pvalues`), as the `intercept` argument.
addRScore('model4', model4.rsquared, model4.params.Intercept, mse)

In [22]:
# Quadratic fit: Salary on YearsExperience and its square.
model5 = sm.ols("Salary~YearsExperience+pow(YearsExperience,2)",data=df).fit()
# In-sample error on the data the model was fitted to.
mse = mean_squared_error(y, model5.predict())
# Pass the fitted intercept coefficient (`params`), not the p-value of the
# intercept (`pvalues`), as the `intercept` argument.
addRScore('model5', model5.rsquared, model5.params.Intercept, mse)

In [23]:
# Overlay model5's in-sample predictions on the raw salary scatter.
axis_labels = {'Salary': 'Salary', 'YearsExperience': 'Years Experience'}
plot = py.scatter(df, x='YearsExperience', y='Salary', labels=axis_labels)
fit_trace = go.Scatter(
    x=df['YearsExperience'],
    y=model5.predict(),
    name='Regression Fit',
    mode="markers",
)
plot.add_traces(fit_trace)
plot.show()

In [24]:
# Show the recorded R-squared / MSE summary for every fitted model.
getRScore()

[{'model': 'model1', 'rsquared': 0.957, 'mse': 31270951.722},
 {'model': 'model2', 'rsquared': 0.854, 'mse': 106149618.722},
 {'model': 'model3', 'rsquared': 0.931, 'mse': 50127755.617},
 {'model': 'model4', 'rsquared': 0.911, 'mse': 64433306.68},
 {'model': 'model5', 'rsquared': 0.957, 'mse': 31257508.451}]

### Model 1 looks best (Model 5 matches its R² with only a marginally lower MSE, at the cost of an extra quadratic term)

In [25]:
# Attach model1's fitted values to the frame as a new `predicted` column.
# Note: this mutates `df` in place, so any cell that slices `df` by column
# position will see an extra column if it is re-run after this point.
df['predicted'] = model1.fittedvalues

In [26]:
# Confirm the new column by showing the first row.
df.iloc[:1]

Unnamed: 0,YearsExperience,Salary,predicted
0,1.1,39343.0,36187.158752


### Prediction Using New Data Points

In [27]:
# Years of experience to score with the chosen model, held in a
# single-column frame whose column name matches the model's predictor.
newdata = pd.Series([11, 12, 13, 15, 20])
newdf = newdata.to_frame(name='YearsExperience')
newdf

Unnamed: 0,YearsExperience
0,11
1,12
2,13
3,15
4,20


In [28]:
# Score the new experience values with model1 and store the result next to
# the inputs.
predicted_salary = model1.predict(newdf)
newdf['Predicted'] = predicted_salary
newdf

Unnamed: 0,YearsExperience,Predicted
0,11,129741.785735
1,12,139191.748056
2,13,148641.710378
3,15,167541.63502
4,20,214791.446628
