## Import Libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Config

In [2]:
# Name of the CSV column that holds the dependent variable (the value the regression predicts)
y_header = "Life expectancy"

## Import CSV dataset

In [3]:
# Read the raw data; the file name is resolved relative to the current working directory.
dataset = pd.read_csv('LifeExpectancyData.csv')
# Boolean mask over the columns: True for everything except the dependent variable.
is_feature_column = dataset.columns != y_header
# to_numpy() returns bare NumPy arrays, i.e. without column headers or the row index.
X = dataset.loc[:, is_feature_column].to_numpy()
y = dataset[y_header].to_numpy()

In [4]:
# Inspect the feature matrix; NumPy abbreviates the middle rows and columns with '...'
print(X)

[['Afghanistan' 'Developing' 2015 ... 17.3 0.479 10.1]
 ['Afghanistan' 'Developing' 2014 ... 17.5 0.47600000000000003 10.0]
 ['Afghanistan' 'Developing' 2013 ... 17.7 0.47 9.9]
 ...
 ['Zimbabwe' 'Developing' 2002 ... 1.3 0.42700000000000005 10.0]
 ['Zimbabwe' 'Developing' 2001 ... 1.7 0.42700000000000005 9.8]
 ['Zimbabwe' 'Developing' 2000 ... 11.2 0.434 9.8]]


In [6]:
# Inspect the dependent-variable values (one entry per dataset row)
print(y)

[65.  59.9 59.9 ... 44.8 45.3 46. ]


## Handling Missing Data (independent variables)

In [7]:
# Example of a row with missing data: the nan entries are the values that need imputing
print(X[2379])

['Somalia' 'Developing' 2013 318.0 51 0.01 0.0 42.0 3173 23.3 81 47.0 nan
 42.0 0.8 47.5432354 nan 6.8 6.6 nan nan]


In [8]:
from sklearn.impute import SimpleImputer

# missing_values selects which entries count as missing; strategy selects how they are
# replaced (here: the column mean; 'median' etc. would also be possible).
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
# A mean only exists for numeric data, so only the columns from index 3 onwards are
# imputed; columns 0-2 (country, status and year, judging by the printed rows) are skipped.
numeric_block = X[:, 3:]
# fit_transform learns the replacement value for each column and applies it in one call.
X[:, 3:] = imputer.fit_transform(numeric_block)
# NOTE(review): the means are computed over the whole dataset, i.e. before the train/test
# split further down, so test rows influence the values used to fill training rows.

In [9]:
# Same row as before imputation: the former nan entries now hold the column means
print(X[2379])

['Somalia' 'Developing' 2013 318.0 51.0 0.01 0.0 42.0 3173.0 23.3 81.0
 47.0 5.9381895280235995 42.0 0.8 47.5432354 12753375.120052494 6.8 6.6
 0.6275510645976182 11.992792792792793]


## Handling Missing Data (dependent variable)

In [10]:
# Example of a row whose dependent variable is missing
print(y[2216])

nan


In [11]:
import numbers

# Assumption: non-imputable target types (strings, etc.) have no missing values, so only a
# numeric target is imputed. The type is judged from the first element alone.
# NOTE(review): mean-filled targets are not real observations, yet they can land in the
# test split (69.22493169 appears in the printed y_test) - consider dropping those rows.
target_is_numeric = isinstance(y[0], numbers.Number)
if target_is_numeric:
    print("Performing impute for dependent varaible")
    # Reshape to a single column for the imputer; y stays 2-D (n_rows, 1) afterwards.
    y = y.reshape(-1, 1)
    y_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    y = y_imputer.fit_transform(y)

Performing impute for dependent varaible


In [12]:
# The previously missing entry now holds the imputed mean (printed as a one-element row
# because y was reshaped to a single column)
print(y[2216])

[69.22493169]


## Encoding Labels (Binary)

In [13]:
# Column 1 before encoding: string category labels
print(X[:,1])

['Developing' 'Developing' 'Developing' ... 'Developing' 'Developing'
 'Developing']


In [14]:
from sklearn.preprocessing import LabelEncoder

# Replace the string labels in column 1 with integer codes.
# NOTE(review): scikit-learn documents LabelEncoder as an encoder for targets (y);
# OrdinalEncoder is the counterpart intended for feature columns.
raw_labels = X[:, 1]
le = LabelEncoder()
# fit_transform learns the label -> code mapping and applies it in a single call.
X[:, 1] = le.fit_transform(raw_labels)

In [15]:
# Column 1 after encoding: integer codes in place of the string labels
print(X[:,1])

[1 1 1 ... 1 1 1]


## Encoding Labels (One-Hot)

In [16]:
# Column 0 before one-hot encoding: country names
print(X[:,0])

['Afghanistan' 'Afghanistan' 'Afghanistan' ... 'Zimbabwe' 'Zimbabwe'
 'Zimbabwe']


In [17]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Each transformer is a (name, estimator, column indexes) triple; this one applies a
# one-hot encoder to column 0 (the country column).
one_hot_country = ('encoder', OneHotEncoder(), [0])
# remainder='passthrough' keeps the columns that no transformer is applied to.
ct = ColumnTransformer(remainder='passthrough', transformers=[one_hot_country])
# One-hot encoding replaces the column with one indicator column per distinct value,
# e.g. [0, 0, 1] has 0 in column 1, 0 in column 2 and 1 in column 3.
# Fit the encoder and apply it to the dataset in one step.
X = ct.fit_transform(X)

In [18]:
# There are 193 one-hot columns, so the row is printed as (row, column) -> value pairs
# instead of a dense list; the remaining columns start at index 193
print(X[1])

  (0, 0)	1.0
  (0, 193)	1.0
  (0, 194)	2014.0
  (0, 195)	271.0
  (0, 196)	64.0
  (0, 197)	0.01
  (0, 198)	73.52358168
  (0, 199)	62.0
  (0, 200)	492.0
  (0, 201)	18.6
  (0, 202)	86.0
  (0, 203)	58.0
  (0, 204)	8.18
  (0, 205)	62.0
  (0, 206)	0.1
  (0, 207)	612.696514
  (0, 208)	327582.0
  (0, 209)	17.5
  (0, 210)	17.5
  (0, 211)	0.47600000000000003
  (0, 212)	10.0


## Splitting data into training and testing datasets

In [19]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing (test_size=0.2). random_state is the random seed;
# keeping it fixed gives the same split on every run.
split = train_test_split(X, y, random_state=1, test_size=0.2)
X_train, X_test, y_train, y_test = split

In [20]:
# Training features, listed as (row, column) -> value pairs
print(X_train)

  (0, 65)	1.0
  (0, 193)	1.0
  (0, 194)	2009.0
  (0, 195)	271.0
  (0, 196)	41.0
  (0, 197)	1.76
  (0, 198)	30.72290501
  (0, 199)	94.0
  (0, 200)	101.0
  (0, 201)	24.9
  (0, 202)	60.0
  (0, 203)	94.0
  (0, 204)	5.17
  (0, 205)	94.0
  (0, 206)	2.1
  (0, 207)	186.76538
  (0, 208)	2393831.0
  (0, 209)	7.3
  (0, 210)	7.2
  (0, 211)	0.542
  (0, 212)	10.4
  (1, 161)	1.0
  (1, 193)	1.0
  (1, 194)	2005.0
  (1, 195)	138.0
  :	:
  (2348, 208)	973146.0
  (2348, 209)	2.4
  (2348, 210)	2.5
  (2348, 211)	0.703
  (2348, 212)	14.1
  (2349, 68)	1.0
  (2349, 193)	1.0
  (2349, 194)	2012.0
  (2349, 195)	189.0
  (2349, 196)	11.0
  (2349, 197)	2.02
  (2349, 198)	484.7187892
  (2349, 199)	96.0
  (2349, 201)	48.6
  (2349, 202)	13.0
  (2349, 203)	94.0
  (2349, 204)	6.33
  (2349, 205)	96.0
  (2349, 206)	0.4
  (2349, 207)	3299.65139
  (2349, 208)	1527156.0
  (2349, 209)	1.2
  (2349, 210)	1.2
  (2349, 211)	0.616
  (2349, 212)	10.6


In [21]:
# Test features, listed as (row, column) -> value pairs
print(X_test)

  (0, 81)	1.0
  (0, 193)	1.0
  (0, 194)	2013.0
  (0, 195)	61.0
  (0, 196)	1.0
  (0, 197)	2.69
  (0, 198)	4279.895512
  (0, 199)	97.0
  (0, 200)	50.0
  (0, 201)	64.2
  (0, 202)	1.0
  (0, 203)	96.0
  (0, 204)	7.89
  (0, 205)	96.0
  (0, 206)	0.1
  (0, 207)	36393.66932
  (0, 208)	8595.0
  (0, 209)	1.2
  (0, 210)	1.1
  (0, 211)	0.8909999999999999
  (0, 212)	15.9
  (1, 179)	1.0
  (1, 193)	1.0
  (1, 194)	2010.0
  (1, 195)	362.0
  :	:
  (586, 209)	6.6
  (586, 210)	6.6
  (586, 211)	0.42700000000000005
  (586, 212)	11.6
  (587, 95)	1.0
  (587, 193)	1.0
  (587, 194)	2008.0
  (587, 195)	281.0
  (587, 196)	10.0
  (587, 197)	3.88
  (587, 198)	40.03344077
  (587, 199)	64.0
  (587, 200)	1.0
  (587, 201)	23.4
  (587, 202)	14.0
  (587, 203)	79.0
  (587, 204)	11.83
  (587, 205)	75.0
  (587, 206)	2.4
  (587, 207)	232.61732
  (587, 208)	3662993.0
  (587, 209)	7.7
  (587, 210)	7.6
  (587, 211)	0.39399999999999996
  (587, 212)	9.8


In [22]:
# Training targets (one single-element row per sample)
print(y_train)

[[66. ]
 [74.2]
 [77.9]
 ...
 [56.3]
 [68.2]
 [71.3]]


In [23]:
# Test targets (one single-element row per sample)
# NOTE(review): this prints every test target; y_test[:5] or y_test.shape would keep the
# output short
print(y_test)

[[82.1       ]
 [58.4       ]
 [63.3       ]
 [67.9       ]
 [47.8       ]
 [89.        ]
 [67.7       ]
 [72.5       ]
 [71.9       ]
 [78.9       ]
 [73.        ]
 [83.        ]
 [52.2       ]
 [57.3       ]
 [72.3       ]
 [59.2       ]
 [75.3       ]
 [72.5       ]
 [61.9       ]
 [53.2       ]
 [72.2       ]
 [69.22493169]
 [71.8       ]
 [55.9       ]
 [79.9       ]
 [79.8       ]
 [72.1       ]
 [68.9       ]
 [68.8       ]
 [51.1       ]
 [69.3       ]
 [54.9       ]
 [73.6       ]
 [73.4       ]
 [78.9       ]
 [76.1       ]
 [67.8       ]
 [81.1       ]
 [74.4       ]
 [72.1       ]
 [75.2       ]
 [80.        ]
 [71.6       ]
 [59.4       ]
 [68.4       ]
 [72.9       ]
 [62.5       ]
 [67.8       ]
 [73.6       ]
 [50.        ]
 [43.3       ]
 [73.8       ]
 [72.6       ]
 [48.5       ]
 [71.9       ]
 [65.6       ]
 [65.9       ]
 [73.1       ]
 [61.8       ]
 [67.5       ]
 [51.4       ]
 [47.1       ]
 [59.2       ]
 [66.6       ]
 [77.3       ]
 [74.        ]
 [73.4    

## Simple Linear Regression (1 independent variable)

In [None]:
# TODO: fit a simple linear regression using a single independent variable

## Multiple Linear Regression

### Training the linear regression model

In [35]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split. The fit(...) call stays the last
# expression of the cell so that the fitted estimator is displayed.
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

LinearRegression()

### Predictions based off testing data

In [36]:
# Predict the dependent variable for the held-out test features
y_pred = regressor.predict(X_test)

In [42]:
# Print floats with two decimals (this setting also applies to later prints in the session).
np.set_printoptions(precision=2)
# Force both arrays into a single column so they can be placed side by side.
n_pred, n_test = len(y_pred), len(y_test)
y_predicted = y_pred.reshape(n_pred, 1)
y_testing = y_test.reshape(n_test, 1)
# Left column: predicted value; right column: actual test value.
side_by_side = np.concatenate((y_predicted, y_testing), axis=1)
print(side_by_side)

[[79.5  82.1 ]
 [58.13 58.4 ]
 [67.74 63.3 ]
 ...
 [71.76 73.1 ]
 [54.5  51.3 ]
 [63.32 58.6 ]]
