# linear regression in python

## Fixing Dr. Oak's pokedex

Dr. Oak's pokedex has fallen into the toilet... <br> 
After leaving it in rice overnight, it is working again, but it has a strange bug:<br>
it can no longer identify a pokemon's HP.  
Fortunately, Dr. Oak still has the full data for the original 150 pokemon.
Help Dr. Oak create a model that predicts HP from other pokemon characteristics. 
<br>
![Alt text](images/pokedex.gif)

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import copy

# new imports from sklearn!
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate


In [None]:
# Read the pokemon table from disk using latin1 decoding.
df = pd.read_csv(
    'pokemon.csv',
    encoding='latin1',
)
# Show the first few rows as a quick sanity check of the loaded columns.
df.head()

## looking at the data
Let's take a look at the joint distribution of HP and Attack.

In [None]:
# One point per row of df: Attack on the x-axis, HP on the y-axis.
sns.scatterplot(data=df, x='Attack', y='HP')
plt.show()

## training our first linear model

In [None]:
# Fit a one-feature linear model, HP ~ Attack, on every row of df.
lm_1 = LinearRegression()
X = df[['Attack']]  # single-column feature frame
y = df[['HP']]      # target kept as a one-column frame
lm_1.fit(X, y)

# With a one-column target, predict() yields one single-element row per sample;
# flatten it so it can be stored as an ordinary column.
pred = lm_1.predict(X).ravel()
df['pred_1'] = pred

# Overlay the fitted values on the raw data. Both series are labelled so the
# legend tells them apart, matching the later overlay plots in this notebook.
sns.scatterplot(x='Attack', y='HP', data=df, label='real')
sns.scatterplot(x='Attack', y='pred_1', data=df, label='prediction')
plt.legend()
plt.show()

In [None]:
# Display the rows whose HP is 200 or more (the candidates for outliers).
df.loc[df['HP'] >= 200]

![Alt text](images/Chansey.png)

## again without outliers

In [None]:
# Working copy restricted to rows with HP of at most 200. The explicit copy()
# means columns added to df_mod later do not write into df.
# NOTE(review): the outlier cell filters on HP >= 200 while this keeps HP <= 200,
# so a row with exactly 200 HP would appear in both -- confirm the intended cutoff.
df_mod = df.loc[df['HP'] <= 200].copy()
sns.scatterplot(data=df_mod, x='Attack', y='HP')
plt.show()

In [None]:
# Refit HP ~ Attack, this time on the filtered frame df_mod.
X = df_mod[['Attack']]
y = df_mod[['HP']]
lm_2 = LinearRegression().fit(X, y)  # fit() returns the estimator itself

# Each prediction row holds a single value; unwrap it before storing the column.
pred = [row[0] for row in lm_2.predict(X)]
df_mod['pred_2'] = pred



In [None]:
# Actual HP and lm_2's fitted HP, both plotted against Attack on df_mod.
for column, series_label in (('HP', 'real'), ('pred_2', 'prediction')):
    sns.scatterplot(data=df_mod, x='Attack', y=column, label=series_label)
plt.legend()
plt.show()

## leverage points

In [None]:
# Compare the two fits on the full data: pred_1 comes from lm_1 (trained on all
# rows of df) and pred_2 from lm_2 (trained on the HP <= 200 subset df_mod).
sns.scatterplot(x='Attack', y='HP', data=df, label='real')
sns.scatterplot(x='Attack', y='pred_2', data=df_mod, label='prediction_2')
sns.scatterplot(x='Attack', y='pred_1', data=df, label='prediction_1')

# Explicit legend call, consistent with the other labelled overlay plots.
plt.legend()
plt.show()

In [None]:
# Fitted parameters of lm_1 (trained on all rows of df).
print('intercept: {}'.format(lm_1.intercept_))
# coef_ holds the slope per feature, so it gets its own label rather than
# being printed as a second "intercept".
print('coefficients: {}'.format(lm_1.coef_))
# Score lm_1 only on the HP <= 200 rows, i.e. the same rows df_mod contains.
print('r^2: {}'.format(r2_score(df_mod['HP'], df.loc[df['HP'] <= 200, 'pred_1'])))

In [None]:
# Fitted parameters of lm_2 (trained on df_mod).
print('intercept: {}'.format(lm_2.intercept_))
# coef_ holds the slope per feature, so it gets its own label rather than
# being printed as a second "intercept".
print('coefficients: {}'.format(lm_2.coef_))
print('r^2: {}'.format(r2_score(df_mod['HP'], df_mod['pred_2'])))

## let's try Defense

In [None]:
# Same exploratory view as for Attack, now with Defense on the x-axis.
sns.scatterplot(data=df_mod, x='Defense', y='HP')
plt.show()

In [None]:
# One-feature model: HP ~ Defense, fitted on df_mod.
X = df_mod[['Defense']]
y = df_mod[['HP']]
lm_3 = LinearRegression().fit(X, y)  # fit() returns the estimator itself

# Each prediction row holds a single value; unwrap it before storing the column.
pred = [row[0] for row in lm_3.predict(X)]
df_mod['pred_3'] = pred


In [None]:
# Actual HP and lm_3's fitted HP, both plotted against Defense on df_mod.
sns.scatterplot(x='Defense', y='HP', data=df_mod, label='real')
# Legend label spelled 'prediction', matching the other overlay plots.
sns.scatterplot(x='Defense', y='pred_3', data=df_mod, label='prediction')
plt.legend()
plt.show()

In [None]:
# Fitted parameters of lm_3 (HP ~ Defense on df_mod).
print('intercept: {}'.format(lm_3.intercept_))
# coef_ holds the slope per feature, so it gets its own label rather than
# being printed as a second "intercept".
print('coefficients: {}'.format(lm_3.coef_))
print('r^2: {}'.format(r2_score(df_mod['HP'], df_mod['pred_3'])))

In [None]:
# Two-feature model: HP ~ Defense + Attack, fitted on df_mod.
X = df_mod[['Defense', 'Attack']]
y = df_mod[['HP']]
lm_4 = LinearRegression().fit(X, y)  # fit() returns the estimator itself

# Each prediction row holds a single value; unwrap it before storing the column.
pred = [row[0] for row in lm_4.predict(X)]
df_mod['pred_4'] = pred


In [None]:
# One figure per input feature of lm_4: actual HP and fitted HP (pred_4)
# plotted against that feature.
for feature in ('Attack', 'Defense'):
    sns.scatterplot(data=df_mod, x=feature, y='HP', label='real')
    sns.scatterplot(data=df_mod, x=feature, y='pred_4', label='prediction')
    plt.legend()
    plt.show()



In [None]:
# Fitted parameters of lm_4 (HP ~ Defense + Attack on df_mod).
print('intercept: {}'.format(lm_4.intercept_))
# coef_ holds one slope per feature, so it gets its own label rather than
# being printed as a second "intercept".
print('coefficients: {}'.format(lm_4.coef_))
print('r^2: {}'.format(r2_score(df_mod['HP'], df_mod['pred_4'])))

In [None]:
# Calculate correlations
corr = df.drop(['Name','Type_1', 'Type_2', 'Total', 'Stage', 'Legendary', 'pred_1'], axis=1).corr()
print(corr)
# Heatmap
sns.heatmap(corr,cmap="YlGnBu")
plt.show()

In [None]:
# Two-feature model: HP ~ Sp_Def + Attack, fitted on df_mod.
lm_5 = LinearRegression()
X = df_mod[['Sp_Def', 'Attack']]
y = df_mod[['HP']]
lm_5.fit(X, y)

# With a one-column target, predict() yields one single-element row per sample;
# flatten it so it can be stored as an ordinary column.
pred = lm_5.predict(X).ravel()
df_mod['pred_5'] = pred

# Plot this model's own output (pred_5) against each of its two features;
# pred_4 belongs to lm_4 and would show the wrong fit here.
sns.scatterplot(x='Attack', y='HP', data=df_mod, label='real')
sns.scatterplot(x='Attack', y='pred_5', data=df_mod, label='prediction')
plt.legend()
plt.show()

sns.scatterplot(x='Sp_Def', y='HP', data=df_mod, label='real')
sns.scatterplot(x='Sp_Def', y='pred_5', data=df_mod, label='prediction')
plt.legend()
plt.show()


In [None]:
# Fitted parameters of lm_5 (HP ~ Sp_Def + Attack on df_mod).
print('intercept: {}'.format(lm_5.intercept_))
# coef_ holds one slope per feature, so it gets its own label rather than
# being printed as a second "intercept".
print('coefficients: {}'.format(lm_5.coef_))
print('r^2: {}'.format(r2_score(df_mod['HP'], df_mod['pred_5'])))

## why not just throw in all possible predictors?

In [None]:
# Five-feature model: HP regressed on all the listed stat columns of df_mod.
lm_6 = LinearRegression()
X = df_mod[['Attack', 'Defense', 'Sp_Atk', 'Sp_Def', 'Speed']]
y = df_mod[['HP']]
lm_6.fit(X, y)

# With a one-column target, predict() yields one single-element row per sample;
# flatten it so it can be stored as an ordinary column.
pred = lm_6.predict(X).ravel()
df_mod['pred_6'] = pred


print('intercept: {}'.format(lm_6.intercept_))
# coef_ holds one slope per feature, so it gets its own label rather than
# being printed as a second "intercept".
print('coefficients: {}'.format(lm_6.coef_))
# R^2 measured on the same rows the model was trained on.
print('r^2: {}'.format(r2_score(df_mod['HP'], df_mod['pred_6'])))

### train-test split, and rmse comparison

In [None]:
# Hold out a third of df_mod and compare train vs. test RMSE for the
# five-feature model.
X = df_mod[['Attack', 'Defense', 'Sp_Atk', 'Sp_Def', 'Speed']]
y = df_mod[['HP']]
# Fixed seed: without it the split is reshuffled on every run, so the RMSE
# values below would not be reproducible. Use the same seed for any model
# whose RMSE is compared against this one.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

lm_7 = LinearRegression()
lm_7.fit(X_train, y_train)

# RMSE is the square root of the mean squared error.
pred_train = lm_7.predict(X_train).ravel()
train_rmse = mean_squared_error(y_train, pred_train) ** 0.5

pred_test = lm_7.predict(X_test).ravel()
test_rmse = mean_squared_error(y_test, pred_test) ** 0.5

print('train rmse: {}'.format(train_rmse))
print('test rmse: {}'.format(test_rmse))

In [None]:
# Same train/test RMSE comparison, with Speed left out of the feature set.
X = df_mod[['Attack', 'Defense', 'Sp_Atk', 'Sp_Def']]
y = df_mod[['HP']]
# Fixed seed: without it the split is reshuffled on every run, so the RMSE
# values below would not be reproducible. Use the same seed for any model
# whose RMSE is compared against this one.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

lm_8 = LinearRegression()
lm_8.fit(X_train, y_train)

# RMSE is the square root of the mean squared error.
pred_train = lm_8.predict(X_train).ravel()
train_rmse = mean_squared_error(y_train, pred_train) ** 0.5

pred_test = lm_8.predict(X_test).ravel()
test_rmse = mean_squared_error(y_test, pred_test) ** 0.5

print('train rmse: {}'.format(train_rmse))
print('test rmse: {}'.format(test_rmse))

### cross validation

In [None]:
# 3-fold cross-validation of an unfitted LinearRegression on the five-feature
# set; cross_validate handles the per-fold fitting and scoring.
X = df_mod[['Attack', 'Defense', 'Sp_Atk', 'Sp_Def', 'Speed']]
y = df_mod[['HP']]

lm_9 = LinearRegression()
cv_results = cross_validate(estimator=lm_9, X=X, y=y, cv=3)

In [None]:
# Display the full result object returned by cross_validate in the previous cell.
cv_results

In [None]:
# Average the per-fold test scores into a single number.
cv_results['test_score'].mean()

In [None]:
# Repeat the 3-fold cross-validation without Speed and show the mean test
# score. Note that lm_9 and cv_results are rebound here, replacing the
# five-feature versions.
X = df_mod[['Attack', 'Defense', 'Sp_Atk', 'Sp_Def']]
y = df_mod[['HP']]

lm_9 = LinearRegression()
cv_results = cross_validate(estimator=lm_9, X=X, y=y, cv=3)
cv_results['test_score'].mean()

# Fixed!
Your model has been tested, and Dr. Oak decided to use it to fix the pokedex!<br>
The world is now once again whole.<br>
Until the next time Dr. Oak plays pokemon on the toilet...