In [None]:
import numpy as np
import pandas as pd
import sklearn
import pylab as plt
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import seaborn as sns

### Intro to Geo-Data and Machine Learning
This notebook introduces working with geographic data using `geopandas`. It also uses the `auto-mpg` dataset to illustrate a typical ML workflow: load the data, handle missing values, explore it visually, build a model, and evaluate the model.

#### Geographic data

Geopandas is similar to Pandas but also supports manipulating vector data, e.g. national or subnational boundaries.
Here we will calculate the GDP per capita of countries in South America and plot it on a map.

In [None]:
! pip install geopandas

In [None]:
import geopandas as gpd

In [None]:
# Look up the low-resolution Natural Earth countries file bundled with
# geopandas, then read it into a GeoDataFrame.
# NOTE(review): `gpd.datasets` is deprecated in recent geopandas releases —
# confirm the installed version still provides it.
naturalearth_path = gpd.datasets.get_path('naturalearth_lowres')
world = gpd.read_file(naturalearth_path)

In [None]:
# Preview the first rows of the world frame to see which columns it offers.
world.head()

In [None]:
# Draw every geometry in the world frame as a quick sanity check of the data.
world.plot()

In [None]:
# List the distinct continent labels so we know the exact spelling to filter on.
world['continent'].unique()

In [None]:
# Keep only the rows whose continent is South America.
gdf_SA = world.loc[world['continent'] == 'South America']

In [None]:
# Inspect the available columns before deriving new ones.
gdf_SA.columns

In [None]:
# Derive GDP per capita as estimated GDP divided by estimated population.
# `assign` returns a new frame rather than writing a column into a filtered
# slice of `world`, which avoids pandas' SettingWithCopyWarning and keeps this
# cell safe to re-run.
# NOTE(review): `gdp_md_est` is presumably in millions of USD, which would make
# this ratio "millions of USD per person" — confirm units before labelling.
gdf_SA = gdf_SA.assign(gdp_per_capita=gdf_SA.gdp_md_est / gdf_SA.pop_est)

In [None]:
# Drop entries whose estimated population is 10000 or less.
gdf_SA = gdf_SA.loc[gdf_SA['pop_est'] > 10000]

In [None]:
# Choropleth of South America coloured by the derived per-capita column.
# The title is set on the Axes returned by `plot`; the trailing semicolon
# suppresses the text repr of the title object.
ax = gdf_SA.plot(column='gdp_per_capita', legend=True)
ax.set_title("Countries of South America by per capita income");

#### Analysis and machine learning
Here we will build a model to estimate fuel efficiency of cars based on their characteristics.

#### 1. Load and inspect dataset

In [None]:
# Location of the mpg CSV in the seaborn-data GitHub repository.
mpg_url = "https://github.com/mwaskom/seaborn-data/raw/master/mpg.csv"

In [None]:
# Download the CSV and parse it into a DataFrame (requires network access).
df = pd.read_csv(mpg_url)

In [None]:
# Preview the first rows to check that the file parsed as expected.
df.head()

In [None]:
# (rows, columns) of the raw data, for comparison after missing rows are dropped.
df.shape

#### 2. Missing values? Handle them.

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Drop every row that contains at least one missing value.
# Rebinding the name instead of using `inplace=True` follows current pandas
# guidance and keeps the step an explicit, chainable expression.
df = df.dropna()

In [None]:
# (rows, columns) after dropping rows with missing values.
df.shape

#### 3. Exploratory visualization

In [None]:
# Grid of pairwise plots across the frame's variables, to eyeball relationships.
sns.pairplot(df)

In [None]:
# Pairwise correlations between the numeric columns.
# Non-numeric columns are excluded explicitly: since pandas 2.0 `corr()` no
# longer drops them silently and raises a ValueError when it meets strings.
df.select_dtypes(include='number').corr()

In [None]:
# Heatmap of the correlation matrix of the numeric columns.
# Non-numeric columns are excluded explicitly: since pandas 2.0 `corr()` raises
# a ValueError on string columns instead of dropping them silently.
sns.heatmap(df.select_dtypes(include='number').corr())

#### 4. Train-test split

In [None]:
# Predictor matrix: the six numeric car characteristics used as model inputs.
X = df.loc[:, ['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model_year']]

# Target vector: fuel efficiency in miles per gallon.
y = df.loc[:, 'mpg']

In [None]:
# Split data into training (80%) and testing (20%) sets.
# A fixed random_state makes the split — and every score computed from it —
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=42
)

In [None]:
# Size of the training predictors as (rows, columns).
print(X_train.shape)

In [None]:
# Size of the held-out test predictors as (rows, columns).
print(X_test.shape)

#### 5. Train model(s)

In [None]:
# Fit an ordinary least-squares linear regression on the training split.
# The fit call is kept as the cell's final expression so the fitted
# estimator is still displayed.

regr = linear_model.LinearRegression()
regr.fit(X=X_train, y=y_train)


In [None]:
# what coefficients did it produce?

# Take the feature names from the frame the model was fitted on, so the
# (name, coefficient) pairing cannot drift out of sync with the predictor
# selection made earlier in the notebook.
cols = list(X_train.columns)
list(zip(cols, regr.coef_))

In [None]:
# Fit a random forest regressor on the same training split.
# The fixed random_state makes the forest's randomised construction — and
# therefore its scores — repeatable between runs.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)

#### 6. Evaluate model accuracy

In [None]:
# Mean squared error and R-squared of the linear regression on the training set.
# NOTE(review): these are in-sample scores; scoring on X_test / y_test would
# give a held-out estimate of how well the model generalises.

preds_LR = regr.predict(X_train)
# Residuals must use this model's own predictions (`preds_LR`); no bare
# `preds` variable is defined anywhere in the notebook.
mse_LR = np.mean((preds_LR - y_train) ** 2)
rsq_LR = regr.score(X_train, y_train)

print("Mean Squared Error: %.4f \n R-squared: %.4f" % (mse_LR,rsq_LR))

In [None]:
# Mean squared error and R-squared of the random forest on the training set.
# NOTE(review): these are in-sample scores; scoring on X_test / y_test would
# give a held-out estimate of how well the model generalises.

preds_RF = rf.predict(X_train)
# Residuals must use this model's own predictions (`preds_RF`); no bare
# `preds` variable is defined anywhere in the notebook.
mse_RF = np.mean((preds_RF - y_train) ** 2)
rsq_RF = rf.score(X_train, y_train)

print("Mean Squared Error: %.4f \n R-squared: %.4f" % (mse_RF,rsq_RF))

In [None]:
# One-row summary table: the R-squared of each model, one column per model.
outputs = pd.DataFrame(
    data={"linear regression model": rsq_LR, "random forest model": rsq_RF},
    index=['r-squared'],
)
outputs

In [None]:
# Bar chart comparing the two models' R-squared values.
outputs.plot.bar(title="Accuracy evaluation (r-squared)")