In [1]:
import pandas as pd
import numpy as np
from sklearn import neighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier

# Load the two source tables; both files are read as ISO-8859-1 (Latin-1).
world = pd.read_csv('world.csv', encoding='ISO-8859-1')
life = pd.read_csv('life.csv', encoding='ISO-8859-1')

# Outer-join the two tables on the country code. The last five rows of
# `world` are excluded from the join (presumably non-data footer rows --
# TODO confirm against the file).
df2 = world.iloc[:-5].merge(life, how='outer', on='Country Code')

# Drop the countries that do not have a class label.
df2 = df2.dropna(subset=['Life expectancy at birth (years)'])

# Features: the columns at positions 3 through 22 of the merged table.
data = df2[df2.columns[3:23].tolist()]

# Class labels.
classlabel = df2['Life expectancy at birth (years)']

In [2]:
# Randomly hold out one third of the instances for testing and train on the
# remaining two thirds; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    classlabel,
    train_size=2/3,
    test_size=1/3,
    random_state=100,
)

def medium_impute(small_df, medians=None):
    ''' Return a numeric copy of ``small_df`` with missing values median-imputed.

    Every column is converted with ``pd.to_numeric(errors='coerce')``, so the
    '..' / '...' placeholders (and any other non-numeric cell) become NaN, and
    each NaN is then replaced by a per-column median.

    Parameters
    ----------
    small_df : pandas.DataFrame
        Feature table to clean. It is not modified.
    medians : mapping or pandas.Series, optional
        Fill value per column name. Defaults to the medians of ``small_df``
        itself; pass the medians of another frame (e.g. the training split)
        to impute with those instead.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns as ``small_df``. A column
        keeps its NaNs only when it has no usable fill value (for example a
        column that is entirely missing).
    '''
    # Convert first: the median must be taken on numbers, not on the raw
    # strings, and coercing up front means no NaN can appear after the fill.
    numeric_df = small_df.apply(pd.to_numeric, errors='coerce')

    if medians is None:
        medians = numeric_df.median()

    # fillna returns a new frame, so the caller's data is never written to
    # (no chained in-place assignment on column selections).
    return numeric_df.fillna(medians)

# Impute each split on an explicit copy so the imputation never writes
# through to the frames handed back by train_test_split.
# NOTE(review): the test split is imputed with its own medians rather than
# the training medians -- confirm this is intended, since it lets test-set
# statistics influence the test features.
X_train = medium_impute(X_train.copy())
X_test = medium_impute(X_test.copy())

# Normalise the data to zero mean and unit variance. The scaler is fitted on
# the training split only and then applied to both splits.
scaler = preprocessing.StandardScaler().fit(X_train)

# Wrap the scaled values back into DataFrames. Both splits get the feature
# names back (previously only X_train did, leaving X_test with integer column
# labels that did not match the frame the models are fitted on). The row
# index is the default RangeIndex, as before.
X_train = pd.DataFrame(scaler.transform(X_train), columns=data.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=data.columns)

In [3]:
# k-NN with k=5: fit on the training split, predict the class label of the
# test split, and report the accuracy of those predictions.
knn1 = neighbors.KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred1 = knn1.predict(X_test)

print(f'Accuracy of k-nn (k=5): {accuracy_score(y_test, y_pred1):.3f}')

Accuracy of k-nn (k=5): 0.820


In [4]:
# k-NN with k=10: fit on the training split, predict the class label of the
# test split, and report the accuracy of those predictions.
knn2 = neighbors.KNeighborsClassifier(n_neighbors=10).fit(X_train, y_train)
y_pred2 = knn2.predict(X_test)

print(f'Accuracy of k-nn (k=10): {accuracy_score(y_test, y_pred2):.3f}')

Accuracy of k-nn (k=10): 0.869


In [5]:
# Predict the class label with a decision tree (entropy criterion, depth
# capped at 4) and evaluate it on the test split.
# random_state is fixed because scikit-learn permutes the candidate features
# at every split, so an unseeded tree (and its accuracy) can change from one
# run to the next. The seed matches the one used for the train/test split.
dt = DecisionTreeClassifier(criterion="entropy", max_depth=4, random_state=100)
dt.fit(X_train, y_train)

y_pred=dt.predict(X_test)
print(f'Accuracy of decision tree: {accuracy_score(y_test, y_pred):.3f}')

Accuracy of decision tree: 0.787


In [6]:
# Per-feature summary (median, mean, sample variance) of the training
# features, written to task2a.csv without the row index.
# NOTE(review): X_train holds the standardised values at this point (see the
# scaling step earlier in the notebook), so these are statistics of the
# scaled data, not of the raw features -- confirm that is what task2a.csv is
# meant to contain.
feature_dict = {
    'feature': list(data.columns),
    'median': [X_train[name].median() for name in data.columns],
    'mean': [X_train[name].mean() for name in data.columns],
    'variance': [X_train[name].var() for name in data.columns],
}
feature_df = pd.DataFrame(feature_dict)
feature_df.to_csv('task2a.csv', index=False)

Unnamed: 0,feature,median,mean,variance
0,"Access to electricity, rural (% of rural popul...",0.667067,3.6400750000000004e-17,1.008264
1,Adjusted savings: particulate emission damage ...,-0.382918,-3.6400750000000004e-17,1.008264
2,"Birth rate, crude (per 1,000 people) [SP.DYN.C...",-0.151434,6.879743e-16,1.008264
3,"Cause of death, by communicable diseases and m...",-0.497296,1.965641e-16,1.008264
4,"Cause of death, by non-communicable diseases (...",0.26666,-3.330669e-16,1.008264
5,Domestic general government health expenditure...,-0.397153,4.3680910000000004e-17,1.008264
6,Individuals using the Internet (% of populatio...,0.146725,3.276068e-17,1.008264
7,Lifetime risk of maternal death (%) [SH.MMR.RI...,-0.487696,8.190170000000001e-17,1.008264
8,Lifetime risk of maternal death (1 in: rate va...,-0.435099,-9.100189e-18,1.008264
9,"Maternal mortality ratio (modeled estimate, pe...",-0.458265,1.5470320000000003e-17,1.008264
