<a href="https://colab.research.google.com/github/victoruwazurike1/Hamoye_Internship_Notebooks/blob/main/hamoye_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [30]:
import pandas as pd
import numpy as np
import sklearn.utils
from sklearn.model_selection import train_test_split, cross_val_score, KFold, StratifiedKFold, LeaveOneOut
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, recall_score, accuracy_score, precision_score, confusion_matrix
from imblearn.over_sampling import SMOTE 

import matplotlib
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from matplotlib.pyplot import figure

%matplotlib inline 
matplotlib.rcParams['figure.figsize'] = [14,10] # This is to set the configuration of the plots we will create in this analysis

In [2]:
df = pd.read_csv('https://query.data.world/s/wh6j7rxy2hvrn4ml75ci62apk5hgae') 

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
df.head()

Unnamed: 0,country,year,country_code,record,crop_land,grazing_land,forest_land,fishing_ground,built_up_land,carbon,total,QScore
0,Armenia,1992,1,AreaPerCap,0.140292,0.199546,0.097188051,0.036888,0.02932,0.0,0.5032351,3A
1,Armenia,1992,1,AreaTotHA,483000.0,687000.0,334600.0,127000.0,100943.0008,0.0,1732543.0,3A
2,Armenia,1992,1,BiocapPerCap,0.159804,0.135261,0.084003213,0.013742,0.033398,0.0,0.4262086,3A
3,Armenia,1992,1,BiocapTotGHA,550176.2427,465677.9722,289207.1078,47311.55172,114982.2793,0.0,1467355.0,3A
4,Armenia,1992,1,EFConsPerCap,0.38751,0.189462,1.26e-06,0.004165,0.033398,1.114093,1.728629,3A


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72186 entries, 0 to 72185
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   country         72186 non-null  object 
 1   year            72186 non-null  int64  
 2   country_code    72186 non-null  int64  
 3   record          72186 non-null  object 
 4   crop_land       51714 non-null  float64
 5   grazing_land    51714 non-null  float64
 6   forest_land     51714 non-null  object 
 7   fishing_ground  51713 non-null  float64
 8   built_up_land   51713 non-null  float64
 9   carbon          51713 non-null  float64
 10  total           72177 non-null  float64
 11  QScore          72185 non-null  object 
dtypes: float64(6), int64(2), object(4)
memory usage: 6.6+ MB


In [5]:
# Let us check the distribution of target variable
df['QScore'].value_counts()

3A    51481
2A    10576
2B    10096
1A       16
1B       16
Name: QScore, dtype: int64

We can see that the target column is highly imbalanced

In [6]:
df.isna().sum()

country               0
year                  0
country_code          0
record                0
crop_land         20472
grazing_land      20472
forest_land       20472
fishing_ground    20473
built_up_land     20473
carbon            20473
total                 9
QScore                1
dtype: int64

In [7]:
# For simplicity we aaare going to drop the missing values
df = df.dropna()

In [8]:
# We can now check again to confirm the null values have been dropped
df.isna().sum()


country           0
year              0
country_code      0
record            0
crop_land         0
grazing_land      0
forest_land       0
fishing_ground    0
built_up_land     0
carbon            0
total             0
QScore            0
dtype: int64

In [9]:
# We have confirmed we don't have any null values
# Now we will check how many classes our target variable has 
df['QScore'].value_counts()

3A    51473
2A      224
1A       16
Name: QScore, dtype: int64

In [10]:
# We see that we are left with 3 classes but still highly imbalanced
# Let us convert this to a binary classification problem by converting class '1A' and '2A'
df[ 'QScore' ] = df[ 'QScore' ].replace([ '1A' ], '2A' )


In [11]:
# Now we have just 2 classes left
df['QScore'].value_counts()

3A    51473
2A      240
Name: QScore, dtype: int64

In [12]:
# We are going to undersample class '3A'
df_2A = df[df.QScore== '2A' ]
df_3A = df[df.QScore== '3A' ].sample(350)
data_df = df_2A.append(df_3A) 

In [13]:
data_df['QScore'].value_counts()

3A    350
2A    240
Name: QScore, dtype: int64

In [14]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 590 entries, 1536 to 38450
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   country         590 non-null    object 
 1   year            590 non-null    int64  
 2   country_code    590 non-null    int64  
 3   record          590 non-null    object 
 4   crop_land       590 non-null    float64
 5   grazing_land    590 non-null    float64
 6   forest_land     590 non-null    object 
 7   fishing_ground  590 non-null    float64
 8   built_up_land   590 non-null    float64
 9   carbon          590 non-null    float64
 10  total           590 non-null    float64
 11  QScore          590 non-null    object 
dtypes: float64(6), int64(2), object(4)
memory usage: 59.9+ KB


In [15]:
data_df = sklearn.utils.shuffle(data_df)
data_df = data_df.reset_index(drop = True)
print(data_df.shape)
data_df.QScore.value_counts()

(590, 12)


3A    350
2A    240
Name: QScore, dtype: int64

In [16]:
data_df = data_df.drop(columns = ['country_code', 'country', 'year'])

In [17]:
X = data_df.drop(columns = 'QScore')
y = data_df['QScore']

In [18]:
# Now let us spit the data into training and testing set
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)
y_train.value_counts()

3A    246
2A    167
Name: QScore, dtype: int64

We can see that there is still imbalance in the class distribution so we are going to use SMOTE only on the training data to handle this.

In [19]:
# We are going to encode the categorical variables
encoder = LabelEncoder() 
x_train.record = encoder.fit_transform(x_train.record)
x_test.record = encoder.fit_transform(x_test.record)


In [20]:
smote = SMOTE(random_state = 1)
x_train_balanced, y_balanced = smote.fit_resample(x_train, y_train)

In [21]:
scaler = MinMaxScaler()
normalised_train_df = scaler.fit_transform(x_train_balanced.drop(columns = 'record'))
normalised_train_df = pd.DataFrame(normalised_train_df,
columns=x_train_balanced.drop(columns=[ 'record' ]).columns) 
normalised_train_df[ 'record' ] = x_train_balanced[ 'record' ]

In [22]:
x_test = x_test.reset_index(drop= True )
normalised_test_df = scaler.transform(x_test.drop(columns=[ 'record' ]))
normalised_test_df = pd.DataFrame(normalised_test_df,
columns=x_test.drop(columns=[ 'record' ]).columns)
normalised_test_df[ 'record' ] = x_test[ 'record']

In [23]:
# Now we can use Logistic Regression
log_reg = LogisticRegression()
log_reg.fit(normalised_train_df, y_balanced)
log_reg.predict(normalised_test_df)

array(['2A', '3A', '2A', '2A', '2A', '3A', '2A', '2A', '2A', '2A', '3A',
       '2A', '2A', '3A', '2A', '2A', '2A', '2A', '2A', '3A', '2A', '3A',
       '3A', '3A', '2A', '2A', '3A', '2A', '3A', '3A', '2A', '2A', '3A',
       '2A', '2A', '2A', '3A', '3A', '3A', '3A', '2A', '3A', '2A', '2A',
       '2A', '2A', '2A', '3A', '2A', '3A', '2A', '2A', '2A', '2A', '3A',
       '3A', '3A', '3A', '3A', '3A', '3A', '3A', '3A', '3A', '3A', '3A',
       '3A', '2A', '2A', '3A', '2A', '2A', '2A', '2A', '2A', '3A', '3A',
       '2A', '3A', '3A', '3A', '2A', '2A', '2A', '3A', '3A', '2A', '2A',
       '3A', '2A', '2A', '3A', '3A', '2A', '3A', '3A', '3A', '3A', '2A',
       '3A', '2A', '3A', '3A', '2A', '2A', '2A', '3A', '3A', '3A', '3A',
       '3A', '3A', '3A', '2A', '2A', '3A', '3A', '3A', '2A', '3A', '3A',
       '3A', '3A', '2A', '3A', '2A', '2A', '2A', '2A', '2A', '2A', '2A',
       '3A', '3A', '2A', '3A', '2A', '2A', '3A', '3A', '3A', '2A', '3A',
       '2A', '3A', '2A', '2A', '2A', '2A', '3A', '2

Measuring classification performance

In [24]:
scores = cross_val_score(log_reg, normalised_train_df, y_balanced, cv= 5 , scoring= 'f1_macro' )
scores 

array([0.54545455, 0.49489796, 0.55027117, 0.47823364, 0.53961791])

In [25]:
# K fold Cross Validation
kf = KFold(n_splits= 5)
kf.split(normalised_train_df)
f1_scores = []

# We are going to iterate through every split to get the f1_scores

for train_index, test_index in kf.split(normalised_train_df): 
  x_train, x_test = normalised_train_df.iloc[train_index], normalised_train_df.iloc[test_index]
  y_train, y_test = y_balanced[train_index], y_balanced[test_index] 
  model = LogisticRegression().fit(x_train, y_train)
  f1_scores.append(f1_score(y_true=y_test, y_pred=model.predict(x_test), 
                            pos_label= '2A' )* 100 ) 
f1_scores

[52.63157894736842,
 49.056603773584904,
 51.485148514851474,
 55.172413793103445,
 0.0]

In [26]:
# Stratified K fold Validation
skf = StratifiedKFold(n_splits= 5 , shuffle= True , random_state= 1 )
f1_scores = []

# We are going to iterate through every split to get the f1_scores

for train_index, test_index in kf.split(normalised_train_df): 
  x_train, x_test = normalised_train_df.iloc[train_index], normalised_train_df.iloc[test_index]
  y_train, y_test = y_balanced[train_index], y_balanced[test_index] 
  model = LogisticRegression().fit(x_train, y_train)
  f1_scores.append(f1_score(y_true=y_test, y_pred=model.predict(x_test), 
                            pos_label= '2A' )* 100 ) 
f1_scores

[52.63157894736842,
 49.056603773584904,
 51.485148514851474,
 55.172413793103445,
 0.0]

Leave One Out Cross Validation

In [29]:
loo = LeaveOneOut()
scores = cross_val_score(LogisticRegression(), normalised_train_df, y_balanced, cv = loo, scoring = 'f1_macro')
average_score = scores.mean() * 100
average_score

52.642276422764226

In [39]:
y_train.shape

(394,)

Confusion Matrix

In [42]:
new_predictions = log_reg.predict(x_test)
new_predictions.shape
cnf_mat = confusion_matrix(y_true = y_test, y_pred = new_predictions, labels= ['2A', '3A'])
cnf_mat

array([[45, 41],
       [ 5,  7]])

In [43]:
accuracy = accuracy_score(y_true=y_test, y_pred=new_predictions)
print( 'Accuracy: {}' .format(round(accuracy* 100 ), 2 )) #prints 53.0

Accuracy: 53


Precision


In [44]:
precision = precision_score(y_true=y_test, y_pred=new_predictions, pos_label= '2A' )
print( 'Precision: {}' .format(round(precision* 100 ), 2 ))

Precision: 90


In [45]:
recall = recall_score(y_true=y_test, y_pred=new_predictions, pos_label= '2A' )
print( 'Recall: {}' .format(round(recall* 100 ), 2 )) 

Recall: 52


In [46]:
f1 = f1_score(y_true=y_test, y_pred=new_predictions, pos_label= '2A' )
print( 'F1: {}' .format(round(f1* 100 ), 2 )) 

F1: 66
