In [3]:
#Setting Up the Data
import os
import pandas as pd

# Define the path to the STA file
folder_path = "Nepal/Nepal/Nepal 2022/NP_2022_DHS_11092023_2251_202866/NPHR82DT"
file_name = "NPHR82FL.DTA"
sta_file_path = os.path.join(folder_path, file_name)

# Check if the file exists
if os.path.exists(sta_file_path):
    # Read the STA file into a pandas DataFrame
    df = pd.read_stata(sta_file_path)
    
    # Display information about the DataFrame
    print("DataFrame Info:")
    print(df.info())
    
    # Display the DataFrame
    print("\nContents of the STA file:")
    print(df)
else:
    print(f"The file {file_name} does not exist in the specified folder.")

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13786 entries, 0 to 13785
Columns: 3027 entries, hhid to hai13z_5
dtypes: category(2659), float64(319), int16(13), int32(4), int8(15), object(17)
memory usage: 73.2+ MB
None

Contents of the STA file:
               hhid hv000  hv001  hv002 hv003  hv004   hv005  hv006  hv007  \
0             1   1   NP8      1      1     1      1  934601   magh   2078   
1             1   6   NP8      1      6     1      1  934601   magh   2078   
2             1   8   NP8      1      8     1      1  934601   magh   2078   
3             1   9   NP8      1      9     2      1  934601   magh   2078   
4             1  11   NP8      1     11     4      1  934601   magh   2078   
...             ...   ...    ...    ...   ...    ...     ...    ...    ...   
13781       476 132   NP8    476    132     1    476  481982  poush   2078   
13782       476 137   NP8    476    137     1    476  481982  poush   2078   
13783       476 142   NP8    47

In [4]:
#Create a new dataframe with the classes and the room proportion
new_df = df[["shecoreg", "hfs_mod", "hfs1", "hv106_01", "hv009", "hv216"]].copy()
new_df["RoomProp"] = new_df["hv009"]/new_df["hv216"]
new_df

Unnamed: 0,shecoreg,hfs_mod,hfs1,hv106_01,hv009,hv216,RoomProp
0,mountain,0.0,no,basic,7,2,3.500000
1,mountain,35698168.0,yes,basic,2,1,2.000000
2,mountain,74705556.0,yes,"no education, preschool/early childhood education",7,2,3.500000
3,mountain,986229.0,no,"no education, preschool/early childhood education",6,2,3.000000
4,mountain,0.0,no,"no education, preschool/early childhood education",6,3,2.000000
...,...,...,...,...,...,...,...
13781,terai,0.0,no,"no education, preschool/early childhood education",5,2,2.500000
13782,terai,986229.0,yes,basic,3,3,1.000000
13783,terai,35698168.0,yes,"no education, preschool/early childhood education",4,3,1.333333
13784,terai,7608985.0,no,basic,4,2,2.000000


In [5]:
#Drop the mountain class
new_df.drop(new_df[new_df["shecoreg"] == "mountain"].index, inplace=True)
new_df

Unnamed: 0,shecoreg,hfs_mod,hfs1,hv106_01,hv009,hv216,RoomProp
172,hill,986229.0,yes,basic,2,2,1.000000
173,hill,0.0,no,"no education, preschool/early childhood education",6,2,3.000000
174,hill,0.0,no,basic,4,1,4.000000
175,hill,7608985.0,no,"no education, preschool/early childhood education",5,2,2.500000
176,hill,99467046.0,yes,"no education, preschool/early childhood education",5,1,5.000000
...,...,...,...,...,...,...,...
13781,terai,0.0,no,"no education, preschool/early childhood education",5,2,2.500000
13782,terai,986229.0,yes,basic,3,3,1.000000
13783,terai,35698168.0,yes,"no education, preschool/early childhood education",4,3,1.333333
13784,terai,7608985.0,no,basic,4,2,2.000000


In [6]:
#Combine the secondary and higher education class together, since the higher education tier has very little.
new_df['hv106_01'] = new_df['hv106_01'].replace(['secondary', 'higher'], 'secondary/higher')
new_df.drop(new_df[new_df["hv106_01"] == "don't know"].index, inplace=True)
new_df

Unnamed: 0,shecoreg,hfs_mod,hfs1,hv106_01,hv009,hv216,RoomProp
172,hill,986229.0,yes,basic,2,2,1.000000
173,hill,0.0,no,"no education, preschool/early childhood education",6,2,3.000000
174,hill,0.0,no,basic,4,1,4.000000
175,hill,7608985.0,no,"no education, preschool/early childhood education",5,2,2.500000
176,hill,99467046.0,yes,"no education, preschool/early childhood education",5,1,5.000000
...,...,...,...,...,...,...,...
13781,terai,0.0,no,"no education, preschool/early childhood education",5,2,2.500000
13782,terai,986229.0,yes,basic,3,3,1.000000
13783,terai,35698168.0,yes,"no education, preschool/early childhood education",4,3,1.333333
13784,terai,7608985.0,no,basic,4,2,2.000000


In [7]:
#Remove outliers
from scipy import stats

# Calculate the z-scores
z_scores = stats.zscore(new_df['RoomProp'])

# establish a threshold of 3
threshold = 3

# Remove outliers based on the threshold
new_df = new_df[(z_scores <= threshold) & (z_scores >= -threshold)]

new_df

Unnamed: 0,shecoreg,hfs_mod,hfs1,hv106_01,hv009,hv216,RoomProp
172,hill,986229.0,yes,basic,2,2,1.000000
173,hill,0.0,no,"no education, preschool/early childhood education",6,2,3.000000
174,hill,0.0,no,basic,4,1,4.000000
175,hill,7608985.0,no,"no education, preschool/early childhood education",5,2,2.500000
176,hill,99467046.0,yes,"no education, preschool/early childhood education",5,1,5.000000
...,...,...,...,...,...,...,...
13781,terai,0.0,no,"no education, preschool/early childhood education",5,2,2.500000
13782,terai,986229.0,yes,basic,3,3,1.000000
13783,terai,35698168.0,yes,"no education, preschool/early childhood education",4,3,1.333333
13784,terai,7608985.0,no,basic,4,2,2.000000


In [8]:
#Quickly get the average of RoomProp. For this study, it will be our classification threshold
new_df["RoomProp"].mean()

2.028334834269686

In [9]:
#Create a new categorical variable called overcrowded based on RoomProp
new_df['overcrowded'] = (new_df['RoomProp'] >= 2)
new_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['overcrowded'] = (new_df['RoomProp'] >= 2)


Unnamed: 0,shecoreg,hfs_mod,hfs1,hv106_01,hv009,hv216,RoomProp,overcrowded
172,hill,986229.0,yes,basic,2,2,1.000000,False
173,hill,0.0,no,"no education, preschool/early childhood education",6,2,3.000000,True
174,hill,0.0,no,basic,4,1,4.000000,True
175,hill,7608985.0,no,"no education, preschool/early childhood education",5,2,2.500000,True
176,hill,99467046.0,yes,"no education, preschool/early childhood education",5,1,5.000000,True
...,...,...,...,...,...,...,...,...
13781,terai,0.0,no,"no education, preschool/early childhood education",5,2,2.500000,True
13782,terai,986229.0,yes,basic,3,3,1.000000,False
13783,terai,35698168.0,yes,"no education, preschool/early childhood education",4,3,1.333333,False
13784,terai,7608985.0,no,basic,4,2,2.000000,True


In [10]:
#Drop unneeded columns 
new_df = new_df.drop(columns=['hv009','hv216','RoomProp','hfs_mod'])
new_df

Unnamed: 0,shecoreg,hfs1,hv106_01,overcrowded
172,hill,yes,basic,False
173,hill,no,"no education, preschool/early childhood education",True
174,hill,no,basic,True
175,hill,no,"no education, preschool/early childhood education",True
176,hill,yes,"no education, preschool/early childhood education",True
...,...,...,...,...
13781,terai,no,"no education, preschool/early childhood education",True
13782,terai,yes,basic,False
13783,terai,yes,"no education, preschool/early childhood education",False
13784,terai,no,basic,True


In [11]:
#import required packages
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from sklearn.metrics import classification_report
from sklearn.svm import SVC

In [12]:
#Naive Bayes

#convert the categorical variables into number values via ordinal encoding
df_nb = new_df.copy()
df_nb = pd.DataFrame(preprocessing.OrdinalEncoder().fit_transform(df_nb), columns=df_nb.columns)
df_nb = df_nb.dropna()
# OrdinalEncoder will skip numerical values. LabelEncoder also works as they share the same functionality.
display(df_nb.head(5))

Unnamed: 0,shecoreg,hfs1,hv106_01,overcrowded
0,0.0,2.0,0.0,0.0
1,0.0,1.0,1.0,1.0
2,0.0,1.0,0.0,1.0
3,0.0,1.0,1.0,1.0
4,0.0,2.0,1.0,1.0


In [13]:
#Split into a train and test set
nb_train, nb_test = train_test_split(df_nb, test_size=0.2)
X_nb_train, y_nb_train = nb_train.drop(columns=['overcrowded']), nb_train['overcrowded']
X_nb_test, y_nb_test = nb_test.drop(columns=['overcrowded']), nb_test['overcrowded']
print(X_nb_train.shape, X_nb_test.shape)

(9825, 3) (2457, 3)


In [14]:
import numpy as np

In [15]:
for column in df_nb.columns:
    print(df_nb[column].value_counts())

shecoreg
0.0    6609
1.0    5673
Name: count, dtype: int64
hfs1
1.0    7868
2.0    4412
0.0       2
Name: count, dtype: int64
hv106_01
0.0    4624
1.0    4354
2.0    3304
Name: count, dtype: int64
overcrowded
1.0    6589
0.0    5693
Name: count, dtype: int64


In [16]:
#Run and classify categorical Naive Bayes
clf_cat = CategoricalNB()

clf_cat.fit(X_nb_train, np.asarray(y_nb_train))

print(classification_report(y_nb_test, clf_cat.predict(X_nb_test)))

              precision    recall  f1-score   support

         0.0       0.54      0.50      0.52      1119
         1.0       0.60      0.63      0.62      1338

    accuracy                           0.58      2457
   macro avg       0.57      0.57      0.57      2457
weighted avg       0.57      0.58      0.57      2457



In [17]:
#Same thing, but implement a Grid Search to fine tune the hyperparameters

from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import CategoricalNB
from sklearn.metrics import classification_report


# Define the hyperparameter grid to search
param_grid = {
    'alpha': [1e-10, 1e-5, 0.1, 0.5, 1.0],  # Adjust values based on your needs
    'fit_prior': [True, False]
}

# Create a CategoricalNB classifier
clf_cat = CategoricalNB()

# Use GridSearchCV to search for the best hyperparameters
grid_search = GridSearchCV(clf_cat, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_nb_train, np.asarray(y_nb_train))

# Print the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

# Evaluate the model with the best hyperparameters
best_clf = grid_search.best_estimator_
print(classification_report(y_nb_test, best_clf.predict(X_nb_test)))


Best Hyperparameters: {'alpha': 1e-10, 'fit_prior': True}
              precision    recall  f1-score   support

         0.0       0.54      0.50      0.52      1119
         1.0       0.60      0.63      0.62      1338

    accuracy                           0.58      2457
   macro avg       0.57      0.57      0.57      2457
weighted avg       0.57      0.58      0.57      2457



In [18]:
#SVM
df_svm = new_df.copy()

#Convert to one hot encoding
df_svm = pd.get_dummies(df_svm)
df_svm = df_svm.astype(int)
df_svm

Unnamed: 0,overcrowded,shecoreg_mountain,shecoreg_hill,shecoreg_terai,hfs1_no,hfs1_yes,hfs1_don't know,"hv106_01_no education, preschool/early childhood education",hv106_01_basic,hv106_01_secondary/higher,hv106_01_don't know
172,0,0,1,0,0,1,0,0,1,0,0
173,1,0,1,0,1,0,0,1,0,0,0
174,1,0,1,0,1,0,0,0,1,0,0
175,1,0,1,0,1,0,0,1,0,0,0
176,1,0,1,0,0,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
13781,1,0,0,1,1,0,0,1,0,0,0
13782,0,0,0,1,0,1,0,0,1,0,0
13783,0,0,0,1,0,1,0,1,0,0,0
13784,1,0,0,1,1,0,0,0,1,0,0


In [19]:
svm_train, svm_test = train_test_split(df_svm, test_size=0.2)
X_svm_train, y_svm_train = svm_train.drop(columns=['overcrowded']), svm_train['overcrowded']
X_svm_test, y_svm_test = svm_test.drop(columns=['overcrowded']), svm_test['overcrowded']

In [20]:
#Linear Kernel
svc_li = SVC(kernel='linear')

scaler = preprocessing.StandardScaler()
scaler.fit(X_svm_train)

Z_svm_train = scaler.transform(X_svm_train)
Z_svm_test = scaler.transform(X_svm_test)

svc_li.fit(Z_svm_train, np.asarray(y_svm_train))

print('Linear Kernel')
print(classification_report(y_svm_test, svc_li.predict(Z_svm_test)))

Linear Kernel
              precision    recall  f1-score   support

           0       0.54      0.39      0.45      1130
           1       0.58      0.72      0.64      1327

    accuracy                           0.57      2457
   macro avg       0.56      0.55      0.55      2457
weighted avg       0.56      0.57      0.56      2457



In [19]:
#WARNING: THIS TAKES A LITTLE WHILE TO RUN

from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn import preprocessing


# Create a linear kernel SVC
svc_li = SVC(kernel='linear')

# Define the hyperparameter grid to search
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100]  # Adjust values based on your needs
}

# Scale the data
scaler = preprocessing.StandardScaler()
scaler.fit(X_svm_train)
Z_svm_train = scaler.transform(X_svm_train)
Z_svm_test = scaler.transform(X_svm_test)

# Use GridSearchCV to search for the best hyperparameters
grid_search = GridSearchCV(svc_li, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(Z_svm_train, np.asarray(y_svm_train))

# Print the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

# Evaluate the model with the best hyperparameters
best_svc_li = grid_search.best_estimator_
print('Linear Kernel')
print(classification_report(y_svm_test, best_svc_li.predict(Z_svm_test)))


Best Hyperparameters: {'C': 0.001}
Linear Kernel
              precision    recall  f1-score   support

           0       0.53      0.55      0.54      1130
           1       0.61      0.59      0.60      1327

    accuracy                           0.57      2457
   macro avg       0.57      0.57      0.57      2457
weighted avg       0.57      0.57      0.57      2457



In [21]:
#RBF Kernel
svc_rbf = SVC(kernel='rbf')

scaler = preprocessing.StandardScaler()
scaler.fit(X_svm_train)

Z_svm_train = scaler.transform(X_svm_train)
Z_svm_test = scaler.transform(X_svm_test)

svc_rbf.fit(Z_svm_train, np.asarray(y_svm_train))

print('RBF Kernel')
print(classification_report(y_svm_test, svc_rbf.predict(Z_svm_test)))

RBF Kernel
              precision    recall  f1-score   support

           0       0.53      0.49      0.51      1130
           1       0.59      0.63      0.61      1327

    accuracy                           0.56      2457
   macro avg       0.56      0.56      0.56      2457
weighted avg       0.56      0.56      0.56      2457



In [22]:
df_rf = new_df.copy()
df_rf = pd.DataFrame(preprocessing.OrdinalEncoder().fit_transform(df_rf), columns=df_rf.columns)
df_rf = df_rf.dropna()
# OrdinalEncoder will skip numerical values. LabelEncoder also works as they share the same functionality.
display(df_rf.head(5))

Unnamed: 0,shecoreg,hfs1,hv106_01,overcrowded
0,0.0,2.0,0.0,0.0
1,0.0,1.0,1.0,1.0
2,0.0,1.0,0.0,1.0
3,0.0,1.0,1.0,1.0
4,0.0,2.0,1.0,1.0


In [23]:
#Random Forest Classifier

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


features = df_rf[['shecoreg', 'hfs1', 'hv106_01']]
target = df_rf['overcrowded']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Create a Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the classifier to the training data
rf_classifier.fit(X_train, y_train)

# Make predictions on the test set
predictions = rf_classifier.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, predictions)
report = classification_report(y_test, predictions)

print(f"Accuracy: {accuracy}")
print("Classification Report:\n", report)


Accuracy: 0.564916564916565
Classification Report:
               precision    recall  f1-score   support

         0.0       0.53      0.49      0.51      1137
         1.0       0.59      0.63      0.61      1320

    accuracy                           0.56      2457
   macro avg       0.56      0.56      0.56      2457
weighted avg       0.56      0.56      0.56      2457



In [24]:
#Feature Importance

# Identify the most important features
feature_importances = rf_classifier.feature_importances_

# Create a DataFrame to display the feature importances
feature_importance_df = pd.DataFrame({'Feature': features.columns, 'Importance': feature_importances})

# Sort the DataFrame by importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Display the sorted feature importances
print("\nFeature Importances:")
print(feature_importance_df)


Feature Importances:
    Feature  Importance
1      hfs1    0.467413
0  shecoreg    0.360393
2  hv106_01    0.172194


In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Extract features and target variable
features = df_rf[['shecoreg', 'hfs1', 'hv106_01']]
target = df_rf['overcrowded']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Create a Random Forest Classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Define the hyperparameters to tune
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create GridSearchCV
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, 
                           cv=5, scoring='accuracy', n_jobs=-1)

# Fit the classifier to the training data with hyperparameter tuning
grid_search.fit(X_train, y_train)

# Get the best parameters from the grid search
best_params = grid_search.best_params_

# Create a Random Forest Classifier with the best parameters
best_rf_classifier = RandomForestClassifier(random_state=42, **best_params)

# Fit the classifier to the training data
best_rf_classifier.fit(X_train, y_train)

# Make predictions on the test set
predictions = best_rf_classifier.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, predictions)
report = classification_report(y_test, predictions)

print("Best Hyperparameters:", best_params)
print(f"Accuracy: {accuracy}")
print("Classification Report:\n", report)

Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Accuracy: 0.564916564916565
Classification Report:
               precision    recall  f1-score   support

         0.0       0.53      0.49      0.51      1137
         1.0       0.59      0.63      0.61      1320

    accuracy                           0.56      2457
   macro avg       0.56      0.56      0.56      2457
weighted avg       0.56      0.56      0.56      2457

