In [1]:
import pandas as pd

### Unpickling my model and scalers

In [2]:
import pickle

# Load the pickled scaler
with open('scaler.pkl', 'rb') as f:
    transformer = pickle.load(f)

# Load the pickled encoder
with open('encoder.pkl', 'rb') as f:
    encoder = pickle.load(f)

# Load the pickled model
with open('knn_model.pkl', 'rb') as f:
    model = pickle.load(f)


### Reading my dataframes

In [3]:
kiva_df = pd.read_csv('kiva_cleaned.csv')
kiva_df.head()

Unnamed: 0.1,Unnamed: 0,currency,sector,loan_amount,gender,repayment_interval,term_in_months,country,lender_count,posted_year,posted_month,disbursed_year,disbursed_month
0,0,USD,Retail,100000,male,at_end,8,Namibia,1231,2023,April,2023,April
1,1,PYG,Retail,2025,female,monthly,6,Paraguay,23,2023,April,2023,April
2,2,USD,Food,5000,female,monthly,8,Congo (DRC),98,2023,March,2023,March
3,3,RWF,Services,13600,female,irregularly,38,Rwanda,417,2023,April,2023,April
4,4,PYG,Food,2700,female,monthly,7,Paraguay,44,2023,March,2023,March


In [4]:
kiva_df= kiva_df.drop(columns=['Unnamed: 0'],axis=1)

In [5]:
kiva_df.head()

Unnamed: 0,currency,sector,loan_amount,gender,repayment_interval,term_in_months,country,lender_count,posted_year,posted_month,disbursed_year,disbursed_month
0,USD,Retail,100000,male,at_end,8,Namibia,1231,2023,April,2023,April
1,PYG,Retail,2025,female,monthly,6,Paraguay,23,2023,April,2023,April
2,USD,Food,5000,female,monthly,8,Congo (DRC),98,2023,March,2023,March
3,RWF,Services,13600,female,irregularly,38,Rwanda,417,2023,April,2023,April
4,PYG,Food,2700,female,monthly,7,Paraguay,44,2023,March,2023,March


### Dropping my loan amount column as it is my target and doing some cleaning

In [6]:
kiva_df1 = kiva_df.drop(columns=['loan_amount'],axis=1)
kiva_df1.head()

Unnamed: 0,currency,sector,gender,repayment_interval,term_in_months,country,lender_count,posted_year,posted_month,disbursed_year,disbursed_month
0,USD,Retail,male,at_end,8,Namibia,1231,2023,April,2023,April
1,PYG,Retail,female,monthly,6,Paraguay,23,2023,April,2023,April
2,USD,Food,female,monthly,8,Congo (DRC),98,2023,March,2023,March
3,RWF,Services,female,irregularly,38,Rwanda,417,2023,April,2023,April
4,PYG,Food,female,monthly,7,Paraguay,44,2023,March,2023,March


In [7]:
kiva_df1['num_male_borrowers'] = kiva_df1['gender'].str.count('male')
kiva_df1['num_female_borrowers'] = kiva_df1['gender'].str.count('female')

In [8]:
kiva_df1['posted_year'] = kiva_df1['posted_year'].astype(object)
kiva_df1['disbursed_year'] = kiva_df1['disbursed_year'].astype(object)

In [21]:
kiva_df1.dtypes

currency                 object
sector                   object
gender                   object
repayment_interval       object
term_in_months            int64
country                  object
lender_count              int64
posted_year              object
posted_month             object
disbursed_year           object
disbursed_month          object
num_male_borrowers        int64
num_female_borrowers      int64
pred_loan_amount        float64
dtype: object

In [9]:
import numpy as np

### Splitting into numerical and categorical

In [10]:
num_val_X = kiva_df1.select_dtypes(np.number)
cat_val_X = kiva_df1.select_dtypes(object) 


In [None]:
num_val_X.head()

In [12]:
cat_val_X.head()

Unnamed: 0,currency,sector,gender,repayment_interval,country,posted_year,posted_month,disbursed_year,disbursed_month
0,USD,Retail,male,at_end,Namibia,2023,April,2023,April
1,PYG,Retail,female,monthly,Paraguay,2023,April,2023,April
2,USD,Food,female,monthly,Congo (DRC),2023,March,2023,March
3,RWF,Services,female,irregularly,Rwanda,2023,April,2023,April
4,PYG,Food,female,monthly,Paraguay,2023,March,2023,March


### Doing the scaling on my validation set

In [13]:
num_scaled = transformer.transform(num_val_X)
cat_scaled = encoder.transform(cat_val_X).toarray()

Feature names unseen at fit time:
- gender
Feature names seen at fit time, yet now missing:
- mode_gender_of_borrower



In [14]:
pd.DataFrame(cat_scaled)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,188,189,190,191,192,193,194,195,196,197
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6821,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6822,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6823,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
val_f = np.concatenate([num_scaled,cat_scaled],axis=1)

In [16]:
val_f

array([[-0.66481174, 43.58015429, -0.29913262, ...,  0.        ,
         0.        ,  0.        ],
       [-0.89623212,  0.08748972, -0.29913262, ...,  0.        ,
         0.        ,  0.        ],
       [-0.66481174,  2.78777932, -0.29913262, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.66481174, -0.74059909, -0.29913262, ...,  0.        ,
         0.        ,  0.        ],
       [-0.54910156, -0.70459523, -0.29913262, ...,  0.        ,
         0.        ,  0.        ],
       [-0.201971  , -0.74059909, -0.29913262, ...,  0.        ,
         0.        ,  0.        ]])

### Doing prediction on validation set.

In [17]:
ypred = model.predict(val_f)



In [18]:
kiva_df1['pred_loan_amount'] = model.predict(val_f)



In [19]:
kiva_df1

Unnamed: 0,currency,sector,gender,repayment_interval,term_in_months,country,lender_count,posted_year,posted_month,disbursed_year,disbursed_month,num_male_borrowers,num_female_borrowers,pred_loan_amount
0,USD,Retail,male,at_end,8,Namibia,1231,2023,April,2023,April,1,0,41262.500000
1,PYG,Retail,female,monthly,6,Paraguay,23,2023,April,2023,April,1,1,616.666667
2,USD,Food,female,monthly,8,Congo (DRC),98,2023,March,2023,March,1,1,2404.166667
3,RWF,Services,female,irregularly,38,Rwanda,417,2023,April,2023,April,1,1,10395.833333
4,PYG,Food,female,monthly,7,Paraguay,44,2023,March,2023,March,1,1,1258.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6821,IDR,Food,female,monthly,13,Indonesia,0,2023,April,2023,April,1,1,370.833333
6822,IDR,Food,female,monthly,13,Indonesia,0,2023,April,2023,April,1,1,370.833333
6823,XOF,Agriculture,female,at_end,8,Senegal,0,2023,April,2023,April,1,1,312.500000
6824,XOF,Retail,female,monthly,9,Togo,1,2023,April,2023,April,1,1,154.166667


In [22]:
kiva_df1.dtypes

currency                 object
sector                   object
gender                   object
repayment_interval       object
term_in_months            int64
country                  object
lender_count              int64
posted_year              object
posted_month             object
disbursed_year           object
disbursed_month          object
num_male_borrowers        int64
num_female_borrowers      int64
pred_loan_amount        float64
dtype: object

In [23]:
kiva_df1['pred_loan_amount']= kiva_df1['pred_loan_amount'].astype(int)

In [24]:
kiva_df1.head(20)

Unnamed: 0,currency,sector,gender,repayment_interval,term_in_months,country,lender_count,posted_year,posted_month,disbursed_year,disbursed_month,num_male_borrowers,num_female_borrowers,pred_loan_amount
0,USD,Retail,male,at_end,8,Namibia,1231,2023,April,2023,April,1,0,41262
1,PYG,Retail,female,monthly,6,Paraguay,23,2023,April,2023,April,1,1,616
2,USD,Food,female,monthly,8,Congo (DRC),98,2023,March,2023,March,1,1,2404
3,RWF,Services,female,irregularly,38,Rwanda,417,2023,April,2023,April,1,1,10395
4,PYG,Food,female,monthly,7,Paraguay,44,2023,March,2023,March,1,1,1258
5,VND,Housing,female,monthly,22,Vietnam,51,2023,March,2023,March,1,1,1612
6,MZN,Agriculture,female,monthly,24,Mozambique,16,2023,April,2023,April,1,1,1245
7,PYG,Education,female,monthly,6,Paraguay,123,2023,April,2023,April,1,1,3220
8,RWF,Food,female,monthly,6,Rwanda,46,2023,April,2023,April,1,1,1258
9,MXN,Retail,female,monthly,5,Mexico,71,2023,April,2023,April,1,1,2087


In [25]:
kiva_df.head(20)

Unnamed: 0,currency,sector,loan_amount,gender,repayment_interval,term_in_months,country,lender_count,posted_year,posted_month,disbursed_year,disbursed_month
0,USD,Retail,100000,male,at_end,8,Namibia,1231,2023,April,2023,April
1,PYG,Retail,2025,female,monthly,6,Paraguay,23,2023,April,2023,April
2,USD,Food,5000,female,monthly,8,Congo (DRC),98,2023,March,2023,March
3,RWF,Services,13600,female,irregularly,38,Rwanda,417,2023,April,2023,April
4,PYG,Food,2700,female,monthly,7,Paraguay,44,2023,March,2023,March
5,VND,Housing,2125,female,monthly,22,Vietnam,51,2023,March,2023,March
6,MZN,Agriculture,10975,female,monthly,24,Mozambique,16,2023,April,2023,April
7,PYG,Education,5200,female,monthly,6,Paraguay,123,2023,April,2023,April
8,RWF,Food,8975,female,monthly,6,Rwanda,46,2023,April,2023,April
9,MXN,Retail,6050,female,monthly,5,Mexico,71,2023,April,2023,April


In [27]:
# select only the two columns you need from each dataframe
pred_loan_amount = kiva_df1['pred_loan_amount']
loan_amount = kiva_df['loan_amount']

# concatenate the two columns horizontally into a new dataframe
merged_df1 = pd.concat([pred_loan_amount, loan_amount], axis=1)

# print the first few rows of the merged dataframe to check the result
merged_df1.head(20)

Unnamed: 0,pred_loan_amount,loan_amount
0,41262,100000
1,616,2025
2,2404,5000
3,10395,13600
4,1258,2700
5,1612,2125
6,1245,10975
7,3220,5200
8,1258,8975
9,2087,6050


In [28]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

r2_score(loan_amount, pred_loan_amount)


0.23810230115395814

### As you see, since there is a difference in details in my both datasets, I got a low score for my validation set. However, for some rows , i got some good predictions. The conclusion is that if i scraped more data and make the column counts similar, i can get a good prediction for my data model.