In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
import plotly.express as py
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.offline import init_notebook_mode, iplot
import cufflinks as cf
init_notebook_mode(connected=True)
cf.go_offline()

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
import statsmodels.formula.api as sm

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the salary dataset (path is relative to the notebook's working
# directory) and preview the first rows.
df = pd.read_csv('Salary_Data.csv')
df.head()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Select the feature and the target by column name instead of by position.
# A positional slice such as df.iloc[:, 1:] silently includes every column
# that is appended to df later on (this notebook adds a 'predicted' column),
# so re-running this cell afterwards would make y two columns wide and break
# the mean_squared_error(y, ...) calls further down.
# Both selections stay single-column DataFrames, as before.
x = df[['YearsExperience']]
y = df[['Salary']]

In [None]:
# Sanity check: first row of the feature selection.
x.head(1)

In [None]:
# Sanity check: first row of the target selection.
y.head(1)

In [None]:
# Salary against experience, with plotly's built-in OLS trendline drawn on top.
plot = py.scatter(
    data_frame=df,
    x='YearsExperience',
    y='Salary',
    labels={'Salary': 'Salary', 'YearsExperience': 'Years Experience'},
    trendline='ols',
)
plot.show()

In [None]:
# Distribution plot of the experience column; create_distplot takes one
# data series and one label per group, hence the single-element lists.
dist = ff.create_distplot(
    hist_data=[df['YearsExperience']],
    group_labels=['Years Experience'],
)
dist.show()

In [None]:
# Histogram of salaries. Title and axis labels are set explicitly so the
# figure can be read on its own when the notebook is skimmed.
df['Salary'].iplot(kind='hist', color='blue',
                   title='Salary distribution', xTitle='Salary', yTitle='Count')

In [None]:
# Bar chart of salary at each experience level. Title and axis labels are set
# explicitly so the figure can be read on its own.
df.iplot(kind='bar', x='YearsExperience', y='Salary',
         title='Salary by years of experience',
         xTitle='Years Experience', yTitle='Salary')

In [None]:
# Box plot of salaries (quick visual check for spread and outliers). Title and
# axis label are set explicitly so the figure can be read on its own.
df['Salary'].iplot(kind='box', color='blue',
                   title='Salary spread', yTitle='Salary')

In [None]:
# Shared registry of fit metrics: one dict per model, in insertion order.
# Re-running this cell resets the registry.
r2socreList = []


def getRScore():
    """Return the registry of recorded model metrics.

    Returns:
        list[dict]: the live registry list (not a copy); each entry has the
        keys 'model', 'rsquared' and 'mse'.
    """
    return r2socreList


def addRScore(modelNum, value, intercept, mse):
    """Record the fit metrics of one model, replacing any earlier entry for it.

    Notebook cells are routinely re-run. Appending unconditionally would leave
    duplicate rows for the same model in the comparison, so an existing entry
    with the same ``modelNum`` is overwritten in place (keeping its position);
    only a model seen for the first time is appended.

    Args:
        modelNum: label identifying the model (e.g. 'model1').
        value: R-squared of the fit; stored rounded to 3 decimals.
        intercept: accepted for call compatibility but not recorded.
            NOTE(review): the callers in this notebook pass the intercept's
            p-value here rather than the coefficient -- confirm intent before
            storing it.
        mse: mean squared error of the fit; stored rounded to 3 decimals.

    Returns:
        None. The registry returned by ``getRScore`` is updated in place.
    """
    obj = {
        'model': modelNum,
        'rsquared': round(value, 3),
        'mse': round(mse, 3)
    }
    for position, entry in enumerate(r2socreList):
        if entry['model'] == modelNum:
            # Same model recorded again (cell re-run): overwrite, don't duplicate.
            r2socreList[position] = obj
            return
    r2socreList.append(obj)

### Using OLS Model

In [None]:
# Baseline: ordinary least squares of Salary on the raw YearsExperience.
# NOTE(review): the `intercept` slot of addRScore receives the intercept's
# p-value (pvalues), not the coefficient itself -- confirm this is intended.
model1 = sm.ols(formula="Salary~YearsExperience", data=df).fit()
mse = mean_squared_error(y_true=y, y_pred=model1.predict())
addRScore(
    modelNum='model1',
    value=model1.rsquared,
    intercept=model1.pvalues['Intercept'],
    mse=mse,
)

In [None]:
# Raw observations with model1's in-sample predictions overlaid as a second trace.
plot = py.scatter(
    data_frame=df,
    x='YearsExperience',
    y='Salary',
    labels={'Salary': 'Salary', 'YearsExperience': 'Years Experience'},
)
plot.add_traces(
    go.Scatter(
        name='Regression Fit',
        mode="markers",
        x=df['YearsExperience'],
        y=model1.predict(),
    )
)
plot.show()

In [None]:
# Variant: Salary regressed on the natural log of YearsExperience.
model2 = sm.ols(formula="Salary~np.log(YearsExperience)", data=df).fit()
mse = mean_squared_error(y_true=y, y_pred=model2.predict())
addRScore(
    modelNum='model2',
    value=model2.rsquared,
    intercept=model2.pvalues['Intercept'],
    mse=mse,
)

In [None]:
# Raw observations with model2's in-sample predictions overlaid as a second trace.
plot = py.scatter(
    data_frame=df,
    x='YearsExperience',
    y='Salary',
    labels={'Salary': 'Salary', 'YearsExperience': 'Years Experience'},
)
plot.add_traces(
    go.Scatter(
        name='Regression Fit',
        mode="markers",
        x=df['YearsExperience'],
        y=model2.predict(),
    )
)
plot.show()

In [None]:
# Variant: Salary regressed on the square root of YearsExperience.
model3 = sm.ols(formula="Salary~np.sqrt(YearsExperience)", data=df).fit()
mse = mean_squared_error(y_true=y, y_pred=model3.predict())
addRScore(
    modelNum='model3',
    value=model3.rsquared,
    intercept=model3.pvalues['Intercept'],
    mse=mse,
)

In [None]:
# Variant: Salary regressed on the cube root of YearsExperience.
model4 = sm.ols(formula="Salary~np.cbrt(YearsExperience)", data=df).fit()
mse = mean_squared_error(y_true=y, y_pred=model4.predict())
addRScore(
    modelNum='model4',
    value=model4.rsquared,
    intercept=model4.pvalues['Intercept'],
    mse=mse,
)

In [None]:
# Variant: quadratic fit -- Salary on YearsExperience plus its square.
model5 = sm.ols(
    formula="Salary~YearsExperience+pow(YearsExperience,2)",
    data=df,
).fit()
mse = mean_squared_error(y_true=y, y_pred=model5.predict())
addRScore(
    modelNum='model5',
    value=model5.rsquared,
    intercept=model5.pvalues['Intercept'],
    mse=mse,
)

In [None]:
# Raw observations with model5's in-sample predictions overlaid as a second trace.
plot = py.scatter(
    data_frame=df,
    x='YearsExperience',
    y='Salary',
    labels={'Salary': 'Salary', 'YearsExperience': 'Years Experience'},
)
plot.add_traces(
    go.Scatter(
        name='Regression Fit',
        mode="markers",
        x=df['YearsExperience'],
        y=model5.predict(),
    )
)
plot.show()

In [None]:
# Show the metrics recorded so far (R-squared and MSE per model) for comparison.
getRScore()

### Model 1 Looks Like the Best Model

In [None]:
# Attach model1's in-sample fitted values to the data.
# NOTE(review): this adds a column to df in place, so any cell that selects
# columns of df by position will see it when re-run after this point.
df['predicted'] = model1.fittedvalues

In [None]:
# Confirm the new 'predicted' column is present.
df.head(1)

### Prediction Using New Data Points

In [None]:
# New experience values to score, wrapped in a one-column frame whose column
# name matches the predictor used in the model formulas.
newdata = pd.Series([11, 12, 13, 15, 20])
newdf = newdata.to_frame(name='YearsExperience')
newdf

In [None]:
# Score the new experience values with model1 and store the result next to
# the inputs, then display the frame.
newdf['Predicted'] = model1.predict(exog=newdf)
newdf