In [None]:
#Importing basic packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

In [None]:
#Importing the Dataset
#Cal's list of top 30 countries with X variables
#NOTE(review): this is an absolute, machine-specific path - the notebook will only run on
#the original author's machine until DATA_PATH is pointed at a local copy of the CSV.
DATA_PATH = 'C:\\Users\\D648007\\CalTop30Countries.csv'
#NOTE(review): 'unicode_escape' was presumably chosen to get past a decode error;
#confirm whether 'latin-1' / 'cp1252' is what is actually intended.
thirty = pd.read_csv(DATA_PATH, encoding= 'unicode_escape')
#Work on a copy: the cleaning cells below assign into `dataset` in place, and without
#the copy those assignments would also modify the raw import held in `thirty`.
dataset = thirty.copy()

In [None]:
#Dataset dimensions, printed as (rows, columns)
row_count, column_count = dataset.shape
print((row_count, column_count))

In [None]:
#Show the dtype pandas inferred for each column
dataset.dtypes

In [None]:
#The Education column was read in as object dtype; convert it to a numeric dtype.
#errors='coerce' turns any value that cannot be parsed as a number into NaN.
education_as_number = pd.to_numeric(dataset["Education"], errors='coerce')
dataset["Education"] = education_as_number
#Confirm the new dtype
dataset.dtypes

In [None]:
#View first 15 rows to see what the data is like
#Columns (the descriptions after each dash are placeholders still to be filled in):
#TOP 30 RICH - country name (dropped further down, before modelling)
#Spend 2020 - TODO
#Tax 2020 - TODO
#Gini WHR - TODO
#Mobility - TODO
#GDP - TODO
#Pollution - TODO
#Democracy - TODO
#Giving - TODO
#Obesity - TODO
#Min wage - TODO
#Gini - TODO
#Education - TODO
#Work - TODO
#Tourism - TODO
#Ladder (2020) - Happiness score OUTCOME VARIABLE
dataset.head(15)


In [None]:
#List the column labels to confirm the exact name of the column that holds the
#country names, since that column is removed in the next step
dataset.columns

In [None]:
#Drop the column with the country names (it identifies the row; it is not a feature).
#errors='ignore' makes this cell safe to re-run: if the column has already been
#dropped the call is a no-op instead of raising KeyError.
dataset = dataset.drop(columns=['TOP 30 RICH'], errors='ignore')
#Check that the column has been dropped
dataset.head(5)

In [None]:
#Shorten the column names so they are easier to read in tables and plots.
#The names are assigned by position, so the list must have one entry per column.
short_names = [
    'Spen', 'Tax', 'GinW', 'Mob', 'GDP',
    'Pol', 'Dem', 'Giv', 'Obe', 'MinW',
    'Gin', 'Edu', 'Work', 'Tour', 'Hap',
]
dataset.columns = short_names

In [None]:
#Find the columns that contain at least one NaN (Not a Number - i.e. a missing value)
nan_values = dataset.isnull()
nan_columns = nan_values.any(axis=0)

columns_with_nan = [column for column, has_nan in nan_columns.items() if has_nan]
print(columns_with_nan)

#These NaNs need to be removed before modelling

In [None]:
#Bar chart of data completeness per feature, to see how much of each feature is missing (NaN)
import missingno as msno
missing_chart = msno.bar(dataset)
missing_chart

In [None]:
#Impute the missing values: each NaN is replaced by the median of its own column.
#(The alternative approach would be to drop the rows that contain NaN.)
column_medians = dataset.median()
dataset = dataset.fillna(value=column_medians)

In [None]:
#Basic descriptive statistics per column: count, mean, standard deviation, min, quartiles (incl. median) and max
summary_stats = dataset.describe()
summary_stats

In [None]:
#Univariate histograms: one panel per feature, showing that feature's distribution
hist_axes = dataset.hist(figsize=(20, 15), bins=30)
plt.show()

In [None]:
#Inspect the correlation between different features, i.e. how they may or may not change with each other.
#Values range from -1 to 1 and a value of 0 means no linear correlation at all.
#Pearson's r measures the strength of the *linear* relationship between two features.
#Some ML algorithms don't work well if the features are highly correlated.
#
#Display settings for the printed correlation table: wider console output and 2 decimal places.
#The full option key 'display.precision' is required - the short form 'precision' is
#ambiguous/removed in newer pandas releases and raises an OptionError there.
pd.set_option('display.width', 150)
pd.set_option('display.precision', 2)
#Pairwise correlation between every pair of columns, printed as a table
CORRELATION_METHOD = 'pearson'
corr = dataset.corr(method=CORRELATION_METHOD)
print(corr)

In [None]:
#View the correlations as an annotated heatmap matrix.
#A diverging palette centred on 0 with the colour scale fixed to [-1, 1] is used so that
#strong negative and strong positive correlations are both easy to spot: red = positive,
#blue = negative, pale = close to zero.
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm', vmin=-1, vmax=1, center=0, ax=ax)
ax.set_title('Pearson correlation between features')
plt.show()

In [None]:
#Pairwise scatterplots of every feature against every other, using the data after the median imputation
pair_grid = sns.pairplot(data=dataset, height=1.5)
pair_grid
#For a classification problem, sns.pairplot(dataset, hue = '(insert name of Y variable)') would colour the dots by class

In [None]:
#Preparing the data for the decision tree model
#Separate the dataset into the independent variables (X: every feature column) and the
#dependent variable (Y: 'Hap', the happiness score), both as NumPy arrays.
#The target is selected by name rather than by column position, so a change in the number
#or order of columns cannot silently put the wrong column into Y.
TARGET_COLUMN = 'Hap'

#Whole dataset as one array (kept so any code that still uses `array` keeps working)
array = dataset.values

X = dataset.drop(columns=[TARGET_COLUMN]).to_numpy(dtype=float)
Y = dataset[TARGET_COLUMN].to_numpy(dtype=float)


In [None]:
#Split into training and test data (currently disabled - the models below are fitted on all rows)
#We would train the model on the training data and then test how good it is using the unseen test data
#from sklearn.model_selection import train_test_split
#X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.25, random_state=5)

In [None]:
#Dimensions of the feature matrix X as (rows, feature columns)
print(X.shape)

In [None]:
#Dimensions of the target vector Y (one value per row)
print(Y.shape)

In [None]:
#Fit model #1: decision tree regressor, at most 3 levels deep and at least 2 samples per leaf
from sklearn.tree import DecisionTreeRegressor
#random_state is fixed so the fitted tree is reproducible: scikit-learn permutes the features
#at every split, so when two candidate splits are equally good the one chosen can otherwise
#differ from run to run.
dt3 = DecisionTreeRegressor(max_depth = 3, min_samples_leaf = 2, random_state = 0)
#NOTE: the model is fitted on all rows (no train/test split is made in this notebook)
dt3.fit(X, Y)

In [None]:
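#Test decision tree model fit using the test dataset (disabled - no train/test split is made above)
#NOTE: for a regressor .score() returns R^2, not a classification accuracy, and the fitted models in this notebook are named dt3/dt4 rather than dt
#resultdt = dt.score(X_test, Y_test)
#print("Test Set Accuracy of Decision Tree model: %.3f%%" % (resultdt*100.0))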

In [None]:
#Confusion matrix (disabled) - left over from a classification example (diabetes); it does not apply to the regression trees fitted in this notebook
#from sklearn.metrics import plot_confusion_matrix
#matrixdt = plot_confusion_matrix(dt, X_test, Y_test, cmap=plt.cm.Blues)
#plt.show(matrixdt)

In [None]:
#Plot the depth-3 decision tree model as a tree using sklearn.tree.plot_tree
from sklearn.tree import plot_tree
fig, ax = plt.subplots(figsize=(15,10))
#Take the feature names from the dataset columns that make up X instead of a second
#hand-typed list, so the labels on the tree cannot drift out of step with the data.
fn = list(dataset.columns[:X.shape[1]])
plot_tree(dt3, filled=True, ax=ax, feature_names = fn, rounded=True, precision = 2)
ax.set_title('Decision tree regressor (max_depth=3, min_samples_leaf=2)')
plt.show()

In [None]:
#Feature importances of the depth-3 tree, labelled with the feature names and sorted so
#the most influential features come first (a bare array would have to be matched to the
#column order by hand).
pd.Series(dt3.feature_importances_, index=dataset.columns[:X.shape[1]], name='dt3_importance').sort_values(ascending=False)

In [None]:
#Fit model #2: decision tree regressor, at most 4 levels deep and at least 2 samples per leaf
from sklearn.tree import DecisionTreeRegressor
#random_state is fixed so the fitted tree is reproducible: scikit-learn permutes the features
#at every split, so when two candidate splits are equally good the one chosen can otherwise
#differ from run to run.
dt4 = DecisionTreeRegressor(max_depth = 4, min_samples_leaf = 2, random_state = 0)
#NOTE: the model is fitted on all rows (no train/test split is made in this notebook)
dt4.fit(X, Y)

In [None]:
#Plot the depth-4 decision tree model as a tree using sklearn.tree.plot_tree
from sklearn.tree import plot_tree
fig, ax = plt.subplots(figsize=(20,10))
#Take the feature names from the dataset columns that make up X instead of a second
#hand-typed list, so the labels on the tree cannot drift out of step with the data.
fn = list(dataset.columns[:X.shape[1]])
#A smaller font is used because the deeper tree has more nodes to fit on the figure
plot_tree(dt4, filled=True, ax=ax, feature_names = fn, rounded=True, precision = 2, fontsize=8)
ax.set_title('Decision tree regressor (max_depth=4, min_samples_leaf=2)')
plt.show()

In [None]:
#Feature importances of the depth-4 tree, labelled with the feature names and sorted so
#the most influential features come first (a bare array would have to be matched to the
#column order by hand).
pd.Series(dt4.feature_importances_, index=dataset.columns[:X.shape[1]], name='dt4_importance').sort_values(ascending=False)