In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, accuracy_score
from sklearn.utils import resample
from scipy.sparse import vstack
from sklearn.metrics import precision_score, f1_score

## 1. Data preview

### Read the data file and print the first 5 lines in the initial data file.

In [2]:
## read the datafile and print the inital datafile
df = pd.read_csv("requirements_data_5.6.csv")
print(df.head())

      MARN       Full_name  Experience_years         Charge  Visa type  \
0  1800328     Kumar Rahul                 7  201 – 500 AUD        491   
1  1799100  Ramandeep Kaur                 8       500+ AUD        838   
2  1799035   Jaspreet Kaur                 8  201 – 500 AUD        151   
3  2318090    Balwant Kaur                 2   101– 200 AUD        175   
4  1570947          Nahida                10  201 – 500 AUD        423   

  Booking preference Location Success rate    Language Employment Type  \
0             Online       SA         >81%  Vietnamese       Organized   
1             Online      QLD        < 30%    Japanese     Independent   
2               Both      VIC         >81%     Serbian     Independent   
3           Inperson      VIC      31%-50%     Tagalog       Organized   
4               Both       SA        < 30%    Romanian       Organized   

   Google rating Availability  
0            4.9   4- 6 month  
1            2.9   4- 6 month  
2            4

### Print the columns' name

In [3]:
print(df.columns)

Index(['MARN', 'Full_name', 'Experience_years', 'Charge', 'Visa type',
       'Booking preference', 'Location', 'Success rate', 'Language',
       'Employment Type', 'Google rating', 'Availability'],
      dtype='object')


### Print the information of the data file

In [4]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4534 entries, 0 to 4533
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   MARN                4534 non-null   int64  
 1   Full_name           4534 non-null   object 
 2   Experience_years    4534 non-null   int64  
 3   Charge              4534 non-null   object 
 4   Visa type           4534 non-null   int64  
 5   Booking preference  4534 non-null   object 
 6   Location            4534 non-null   object 
 7   Success rate        4534 non-null   object 
 8   Language            4534 non-null   object 
 9   Employment Type     4534 non-null   object 
 10  Google rating       4534 non-null   float64
 11  Availability        4534 non-null   object 
dtypes: float64(1), int64(3), object(8)
memory usage: 425.2+ KB
None


### Print the number of rows and columns of the dataframe

In [6]:
print(df.shape)

(4534, 12)


## 2. Data preprogress

### Correct the data type of the data column

In [7]:
## MARN is the unique id for agents, and Visa type is not a numerical data, so change them into string type
df["MARN"] = df["MARN"].astype(str)
df["Visa type"] = df["Visa type"].astype(str)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4534 entries, 0 to 4533
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   MARN                4534 non-null   object 
 1   Full_name           4534 non-null   object 
 2   Experience_years    4534 non-null   int64  
 3   Charge              4534 non-null   object 
 4   Visa type           4534 non-null   object 
 5   Booking preference  4534 non-null   object 
 6   Location            4534 non-null   object 
 7   Success rate        4534 non-null   object 
 8   Language            4534 non-null   object 
 9   Employment Type     4534 non-null   object 
 10  Google rating       4534 non-null   float64
 11  Availability        4534 non-null   object 
dtypes: float64(1), int64(1), object(10)
memory usage: 425.2+ KB
None


### Select the attributes in the dataframe

In [93]:
# select the attributes columns
attribute_columns = [
    "Experience_years",
    "Charge",
    "Visa type",
    "Booking preference",
    "Location",
    "Success rate",
    "Language",
    "Employment Type",
    "Google rating",
    "Availability"
]
# identify the numerical data and categorical data
numerical_cols = ["Experience_years", "Google rating"]
categorical_cols = [col for col in attribute_columns if col not in numerical_cols]

# create a preprocessor to transform data into vectors
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),    #transform the numerical attributes to have similar weight
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)    #transform the categorical attributes into numerical vector
    ]
)

X = preprocessor.fit_transform(df[attribute_columns])
# print(X.shape)
# print(type(X))

<class 'scipy.sparse._csr.csr_matrix'>


## 3. Train the KNN-CF Model

### Initialise the KNN model, use a sample user input to test the output, including the top 3 recommendations and match scores

In [9]:
# initialise the KNN model
knn = NearestNeighbors(n_neighbors=3, metric='cosine') 
knn.fit(X)

# initial test input
user_input = {
    "Experience_years": 7,
    "Charge": "201 – 500 AUD",
    "Visa type": "491",
    "Booking preference": "Online",
    "Location": "SA",
    "Success rate": ">81%",
    "Language": "Vietnamese",
    "Employment Type": "Organized",
    "Google rating": 4.9,
    "Availability": "4- 6 month"
}
user_df = pd.DataFrame([user_input])
user_encoded = preprocessor.transform(user_df)
distances, indices = knn.kneighbors(user_encoded, n_neighbors=3)

print("Top 3 recommendations：")
for idx, (i, dist) in enumerate(zip(indices[0], distances[0])):
    name = df.iloc[i]["Full_name"]
    marn = df.iloc[i]["MARN"]
    # show the match scores（example：1 / (1 + distance)）
    percent_score = (1 / (1 + dist)) * 100
    print(f"{idx + 1}. {name} (MARN: {marn}) → match score: {percent_score:.1f}%")


Top 3 recommendations：
1. Kumar Rahul (MARN: 1800328) → match score: 100.0%
2. Richard Hyosung Lee (MARN: 1384647) → match score: 82.6%
3. Yi Luo (MARN: 1802013) → match score: 78.5%


### Use the non-return sample

In [23]:
# if not resample train
for train_size, test_size in splits:
    print(f"\nTraining with {int(train_size * 100)}% / {int(test_size * 100)}% split:")
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size, test_size=test_size, random_state=42)
    knn = NearestNeighbors(n_neighbors=3, metric='cosine')
    
    knn.fit(X_train)
    
    distances, indices = knn.kneighbors(X_test)
    
    y_pred = np.array([y_train.iloc[indices[i]].values[0] for i in range(len(indices))])

    recall = recall_score(y_test, y_pred, average='micro')  # micro 平均方式考虑所有类别
    accuracy = accuracy_score(y_test, y_pred)
    
    print(f"Recall: {recall:.4f}")
    print(f"Accuracy: {accuracy:.4f}")


Training with 70% / 30% split:
Recall: 0.0000
Accuracy: 0.0000

Training with 80% / 20% split:
Recall: 0.0000
Accuracy: 0.0000

Training with 90% / 10% split:
Recall: 0.0000
Accuracy: 0.0000


Because all the labels in our dataset show up only once, the output is not good. We need to try the resample method for training and testing.

### Define a function to calculate the accuracy, recall, precision and F1-score for the resample method.

In [33]:
def resample_cal(splits, n_total, feature_X, label_Y):
    for train_size, test_size in splits:
        print(f"\nTraining with {int(train_size * 100)}% / {int(test_size * 100)}% split:")
        
        n_train = int(train_size * n_total)
        n_test = int(test_size * n_total)
        
        X_train, y_train = resample(feature_X, label_Y, replace=True, n_samples=n_train)
        X_test, y_test = resample(feature_X, label_Y, replace=True, n_samples=n_test)
        knn = NearestNeighbors(n_neighbors=3, metric='cosine')
        knn.fit(X_train)
        distances, indices = knn.kneighbors(X_test)
    
        # generate top 3 recommendations
        top_k_preds = [y_train.iloc[indices[i]].values for i in range(len(indices))]
    
        # Recall@3
        hits = [y_test.iloc[i] in top_k_preds[i] for i in range(len(y_test))]
        recall_at_3 = sum(hits) / len(hits)
    
        # Precision@3
        correct_preds = [np.sum(top_k_preds[i] == y_test.iloc[i]) for i in range(len(y_test))]
        precision_at_3 = sum(correct_preds) / (len(y_test) * 3)
    
        # F1-score@3
        if recall_at_3 + precision_at_3 > 0:
            f1_at_3 = 2 * precision_at_3 * recall_at_3 / (precision_at_3 + recall_at_3)
        else:
            f1_at_3 = 0.0
            
        accuracy_at_3 = sum(correct_preds) / (len(y_test) * 3)
    
        print(f"Recall@3:    {recall_at_3:.4f}")
        print(f"Precision@3: {precision_at_3:.4f}")
        print(f"F1-score@3:  {f1_at_3:.4f}")
        print(f"Accuracy@3:  {accuracy_at_3:.4f}")

#### The original dataframe size: 4533; split the dataset into the training sets and testing sets with 70% and 30%, 80% and 20%, 90% and 10%. 

In [122]:
# resample methods to calculate metrics
# split the training set and testing set
y = df['MARN']
splits = [(0.7, 0.3), (0.8, 0.2), (0.9, 0.1)]
n_total = X.shape[0]
resample_cal(splits, n_total, X, y)


Training with 70% / 30% split:
Recall@3:    0.5125
Precision@3: 0.2407
F1-score@3:  0.3275
Accuracy@3:  0.2407

Training with 80% / 20% split:
Recall@3:    0.5508
Precision@3: 0.2616
F1-score@3:  0.3547
Accuracy@3:  0.2616

Training with 90% / 10% split:
Recall@3:    0.5982
Precision@3: 0.2995
F1-score@3:  0.3991
Accuracy@3:  0.2995


### Try multiple sampling to generate a larger training set for model training

In [123]:
def mul_resample_cal(splits, n_total, feature_X, label_Y, n_repeats):
    for train_size, test_size in splits:
        print(f"\nTraining with {int(train_size*100)}% / {int(test_size*100)}% split with repeat {n_repeats}:")
    
        n_train = int(train_size * n_total)
        n_test = int(test_size * n_total)
    
        X_train_all = []
        y_train_all = []
    
        for repeat in range(n_repeats):
            X_tmp, y_tmp = resample(feature_X, label_Y, replace=True, n_samples=n_train, random_state=repeat * 10)
            X_train_all.append(X_tmp)
            y_train_all.append(y_tmp)
    
        X_train_combined = vstack(X_train_all)
        y_train_combined = pd.concat(y_train_all).reset_index(drop=True)
    
        X_test, y_test = resample(feature_X, label_Y, replace=True, n_samples=n_test, random_state=100)
    
        knn = NearestNeighbors(n_neighbors=3, metric='cosine')
        knn.fit(X_train_combined)
    
        distances, indices = knn.kneighbors(X_test)
        top_k_preds = [y_train_combined.iloc[indices[i]].values for i in range(len(indices))]
    
        # Recall@3
        hits = [y_test.iloc[i] in top_k_preds[i] for i in range(len(y_test))]
        recall_at_3 = sum(hits) / len(hits)
    
        # Precision@3
        correct_preds = [np.sum(top_k_preds[i] == y_test.iloc[i]) for i in range(len(y_test))]
        precision_at_3 = sum(correct_preds) / (len(y_test) * 3)
    
        # F1-score@3
        if recall_at_3 + precision_at_3 > 0:
            f1_at_3 = 2 * precision_at_3 * recall_at_3 / (precision_at_3 + recall_at_3)
        else:
            f1_at_3 = 0.0
            
        accuracy_at_3 = sum(correct_preds) / (len(y_test) * 3)

        print(f"Accuracy@3:  {accuracy_at_3:.4f}")
        print(f"Precision@3: {precision_at_3:.4f}")
        print(f"Recall@3:    {recall_at_3:.4f}")
        print(f"F1-score@3:  {f1_at_3:.4f}")
        
    #return recall_at_3, precision_at_3, f1_at_3, accuracy_at_3

In [124]:
# resample methods to calculate metrics
# Split the training set and testing set
n_repeats = 1
mul_resample_cal(splits, n_total, X, y, n_repeats)


Training with 70% / 30% split with repeat 1:
Accuracy@3:  0.2289
Precision@3: 0.2289
Recall@3:    0.4934
F1-score@3:  0.3127

Training with 80% / 20% split with repeat 1:
Accuracy@3:  0.2535
Precision@3: 0.2535
Recall@3:    0.5353
F1-score@3:  0.3441

Training with 90% / 10% split with repeat 1:
Accuracy@3:  0.2752
Precision@3: 0.2752
Recall@3:    0.5740
F1-score@3:  0.3720


In [125]:
# resample methods to calculate metrics
# Split the training set and testing set
n_repeats = 2
mul_resample_cal(splits, n_total, X, y, n_repeats)


Training with 70% / 30% split with repeat 2:
Accuracy@3:  0.4414
Precision@3: 0.4414
Recall@3:    0.7331
F1-score@3:  0.5510

Training with 80% / 20% split with repeat 2:
Accuracy@3:  0.4989
Precision@3: 0.4989
Recall@3:    0.7792
F1-score@3:  0.6083

Training with 90% / 10% split with repeat 2:
Accuracy@3:  0.5563
Precision@3: 0.5563
Recall@3:    0.8234
F1-score@3:  0.6640


In [126]:
n_repeats = 3
mul_resample_cal(splits, n_total, X, y, n_repeats)


Training with 70% / 30% split with repeat 3:
Accuracy@3:  0.6135
Precision@3: 0.6135
Recall@3:    0.8824
F1-score@3:  0.7238

Training with 80% / 20% split with repeat 3:
Accuracy@3:  0.6733
Precision@3: 0.6733
Recall@3:    0.9106
F1-score@3:  0.7742

Training with 90% / 10% split with repeat 3:
Accuracy@3:  0.7344
Precision@3: 0.7344
Recall@3:    0.9470
F1-score@3:  0.8272


In [127]:
n_repeats = 4
mul_resample_cal(splits, n_total, X, y, n_repeats)


Training with 70% / 30% split with repeat 4:
Accuracy@3:  0.7439
Precision@3: 0.7439
Recall@3:    0.9382
F1-score@3:  0.8298

Training with 80% / 20% split with repeat 4:
Accuracy@3:  0.7995
Precision@3: 0.7995
Recall@3:    0.9614
F1-score@3:  0.8730

Training with 90% / 10% split with repeat 4:
Accuracy@3:  0.8609
Precision@3: 0.8609
Recall@3:    0.9845
F1-score@3:  0.9186


In [128]:
n_repeats = 5
mul_resample_cal(splits, n_total, X, y, n_repeats)


Training with 70% / 30% split with repeat 5:
Accuracy@3:  0.8353
Precision@3: 0.8353
Recall@3:    0.9721
F1-score@3:  0.8985

Training with 80% / 20% split with repeat 5:
Accuracy@3:  0.8837
Precision@3: 0.8837
Recall@3:    0.9890
F1-score@3:  0.9334

Training with 90% / 10% split with repeat 5:
Accuracy@3:  0.9249
Precision@3: 0.9249
Recall@3:    0.9978
F1-score@3:  0.9600


In [129]:
n_repeats = 6
mul_resample_cal(splits, n_total, X, y, n_repeats)


Training with 70% / 30% split with repeat 6:
Accuracy@3:  0.8973
Precision@3: 0.8973
Recall@3:    0.9838
F1-score@3:  0.9386

Training with 80% / 20% split with repeat 6:
Accuracy@3:  0.9371
Precision@3: 0.9371
Recall@3:    0.9945
F1-score@3:  0.9649

Training with 90% / 10% split with repeat 6:
Accuracy@3:  0.9662
Precision@3: 0.9662
Recall@3:    1.0000
F1-score@3:  0.9828


After the experiment, it was found that repeating 5 times yielded the best results, but repeating 6 times would result in overfitting.

### Change the weights of attributes for training and testing

#### define a function to set the weights

In [130]:
def set_weights(base_weights, preprocessor):
    # set the weights to the onecode matrix
    one_hot_encode = preprocessor.named_transformers_['cat']
    cat_dims = [len(cats) for cats in one_hot_encode.categories_]
    num_dims = len(numerical_cols)
    
    weights_expanded = []
    
    for i in range(num_dims):
        weights_expanded.append(np.full(1, base_weights[i]))
    
    for i, dim in enumerate(cat_dims):
        weights_expanded.append(np.full(dim, base_weights[num_dims + i]))
    
    full_weights = np.concatenate(weights_expanded)
    X_weighted = X * full_weights
    X_weighted = X_weighted.reshape(-1, 1)
    # print(X_weighted_1.shape)
    return X_weighted

In [135]:
def mul_set_weights(base_weights, preprocessor, X):
    # X 是原始稀疏矩阵，例如 TF-IDF 矩阵，或经过 ColumnTransformer 后的输出
    one_hot_encode = preprocessor.named_transformers_['cat']
    cat_dims = [len(cats) for cats in one_hot_encode.categories_]
    num_dims = len(numerical_cols)

    weights_expanded = []

    for i in range(num_dims):
        weights_expanded.append(np.full(1, base_weights[i]))

    for i, dim in enumerate(cat_dims):
        weights_expanded.append(np.full(dim, base_weights[num_dims + i]))

    full_weights = np.concatenate(weights_expanded)

    # 确保 full_weights 是 1D array，长度 == 特征维度
    if X.shape[1] != len(full_weights):
        raise ValueError(f"Shape mismatch: X has {X.shape[1]} features, but weights have {len(full_weights)} values")

    # 关键点：使用 sparse matrix 的乘法，按列缩放
    from scipy.sparse import diags
    W = diags(full_weights)  # 创建一个对角矩阵
    mul_X_weighted = X @ W       # 右乘，对每列应用权重

    return mul_X_weighted

#### 1. Test the weight set 1: feature_weights_1 = weights = {
"Language": 0.25, 
"Google rating": 0.20, 
"Success rate": 0.15, 
"Charge": 0.10,
"Visa type": 0.10, 
"Experience_years": 0.05, 
"Booking preference": 0.05,
"Location": 0.03, 
"Availability": 0.03, 
"Employment Type": 0.04
}

In [133]:
# weight set 1
import numpy as np
import pandas as pd
from scipy.sparse import issparse
base_weights_1 = np.array([0.25, 0.20, 0.15, 0.10, 0.10, 0.05, 0.05, 0.03, 0.03, 0.04])
X_weighted_1 = set_weights(base_weights_1, preprocessor)
# print(X_weighted_1)
resample_cal(splits, n_total, X_weighted_1, y)


Training with 70% / 30% split:
Recall@3:    0.0015
Precision@3: 0.0005
F1-score@3:  0.0007
Accuracy@3:  0.0005

Training with 80% / 20% split:
Recall@3:    0.0011
Precision@3: 0.0004
F1-score@3:  0.0006
Accuracy@3:  0.0004

Training with 90% / 10% split:
Recall@3:    0.0000
Precision@3: 0.0000
F1-score@3:  0.0000
Accuracy@3:  0.0000


In [136]:
mul_X_weighted_1 = mul_set_weights(base_weights_1, preprocessor, X)
mul_resample_cal(splits, n_total=X.shape[0], feature_X=mul_X_weighted_1, label_Y=y, n_repeats=5)


Training with 70% / 30% split with repeat 5:
Accuracy@3:  0.8353
Precision@3: 0.8353
Recall@3:    0.9721
F1-score@3:  0.8985

Training with 80% / 20% split with repeat 5:
Accuracy@3:  0.8837
Precision@3: 0.8837
Recall@3:    0.9890
F1-score@3:  0.9334

Training with 90% / 10% split with repeat 5:
Accuracy@3:  0.9249
Precision@3: 0.9249
Recall@3:    0.9978
F1-score@3:  0.9600


#### 2. Test the weight set 2: feature_weights_2 = {
"Language": 0.15, 
"Google rating": 0.15, 
"Success rate": 0.20, 
"Charge": 0.25,
"Visa type": 0.08, 
"Experience_years": 0.05, 
"Booking preference": 0.05,
"Location": 0.02, 
"Availability": 0.02, 
"Employment Type": 0.03
}

In [140]:
base_weights_2 = np.array([0.15, 0.15, 0.20, 0.25, 0.08, 0.05, 0.05, 0.02, 0.02, 0.03])
X_weighted_2 = set_weights(base_weights_2, preprocessor)
resample_cal(splits, n_total, X_weighted_2, y)


Training with 70% / 30% split:
Recall@3:    0.0007
Precision@3: 0.0002
F1-score@3:  0.0004
Accuracy@3:  0.0002

Training with 80% / 20% split:
Recall@3:    0.0000
Precision@3: 0.0000
F1-score@3:  0.0000
Accuracy@3:  0.0000

Training with 90% / 10% split:
Recall@3:    0.0000
Precision@3: 0.0000
F1-score@3:  0.0000
Accuracy@3:  0.0000


In [142]:
mul_X_weighted_2 = mul_set_weights(base_weights_2, preprocessor, X)
mul_resample_cal(splits, n_total=X.shape[0], feature_X=mul_X_weighted_2, label_Y=y, n_repeats=5)


Training with 70% / 30% split with repeat 5:
Accuracy@3:  0.8353
Precision@3: 0.8353
Recall@3:    0.9721
F1-score@3:  0.8985

Training with 80% / 20% split with repeat 5:
Accuracy@3:  0.8837
Precision@3: 0.8837
Recall@3:    0.9890
F1-score@3:  0.9334

Training with 90% / 10% split with repeat 5:
Accuracy@3:  0.9249
Precision@3: 0.9249
Recall@3:    0.9978
F1-score@3:  0.9600


#### 3. Test the weight set 3: feature_weights_3 = {"Language": 0.10, 
"Google rating": 0.15, 
"Success rate": 0.20, 
"Charge": 0.05,
"Visa type": 0.10, 
"Experience_years": 0.05, 
"Booking preference": 0.05,
"Location": 0.02, 
"Availability": 0.25, 
"Employment Ty
e": 0.03ype": 0.03
}

In [145]:
base_weights_3 = np.array([0.10, 0.15, 0.20, 0.05, 0.10, 0.05, 0.05, 0.02, 0.25, 0.03])
X_weighted_3 = set_weights(base_weights_3, preprocessor)
resample_cal(splits, n_total, X_weighted_3, y)


Training with 70% / 30% split:
Recall@3:    0.0007
Precision@3: 0.0002
F1-score@3:  0.0004
Accuracy@3:  0.0002

Training with 80% / 20% split:
Recall@3:    0.0000
Precision@3: 0.0000
F1-score@3:  0.0000
Accuracy@3:  0.0000

Training with 90% / 10% split:
Recall@3:    0.0000
Precision@3: 0.0000
F1-score@3:  0.0000
Accuracy@3:  0.0000


In [146]:
mul_X_weighted_3 = mul_set_weights(base_weights_3, preprocessor, X)
mul_resample_cal(splits, n_total=X.shape[0], feature_X=mul_X_weighted_3, label_Y=y, n_repeats=5)


Training with 70% / 30% split with repeat 5:
Accuracy@3:  0.8353
Precision@3: 0.8353
Recall@3:    0.9721
F1-score@3:  0.8985

Training with 80% / 20% split with repeat 5:
Accuracy@3:  0.8837
Precision@3: 0.8837
Recall@3:    0.9890
F1-score@3:  0.9334

Training with 90% / 10% split with repeat 5:
Accuracy@3:  0.9249
Precision@3: 0.9249
Recall@3:    0.9978
F1-score@3:  0.9600


#### 4. Test the weight set 4: feature_weights_4 = {
"Language": 0.15, 
"Google rating": 0.25, 
"Success rate": 0.30, 
"Charge": 0.05,
"Visa type": 0.10, 
"Experience_years": 0.05, 
"Booking preference": 0.03,
"Location": 0.02, 
"Availability": 0.02, 
"Employment Ty
}pe": 0.03

In [79]:
base_weights_4 = np.array([0.15, 0.25, 0.30, 0.05, 0.10, 0.05, 0.03, 0.02, 0.02, 0.03])
X_weighted_4 = set_weights(base_weights_4, preprocessor)
resample_cal(splits, n_total, X_weighted_4, y)


Training with 70% / 30% split:
Recall@3:    0.0022
Precision@3: 0.0007
F1-score@3:  0.0011
Accuracy@3:  0.0007

Training with 80% / 20% split:
Recall@3:    0.0011
Precision@3: 0.0004
F1-score@3:  0.0006
Accuracy@3:  0.0004

Training with 90% / 10% split:
Recall@3:    0.0066
Precision@3: 0.0022
F1-score@3:  0.0033
Accuracy@3:  0.0022


In [147]:
mul_X_weighted_4 = mul_set_weights(base_weights_4, preprocessor, X)
mul_resample_cal(splits, n_total=X.shape[0], feature_X=mul_X_weighted_4, label_Y=y, n_repeats=5)


Training with 70% / 30% split with repeat 5:
Accuracy@3:  0.8353
Precision@3: 0.8353
Recall@3:    0.9721
F1-score@3:  0.8985

Training with 80% / 20% split with repeat 5:
Accuracy@3:  0.8837
Precision@3: 0.8837
Recall@3:    0.9890
F1-score@3:  0.9334

Training with 90% / 10% split with repeat 5:
Accuracy@3:  0.9249
Precision@3: 0.9249
Recall@3:    0.9978
F1-score@3:  0.9600


#### 5. Test the weight set 5: feature_weights_5 = {
"Language": 0.15, 
"Google rating": 0.15, 
"Success rate": 0.15, 
"Charge": 0.10,
"Visa type": 0.10, 
"Experience_years": 0.10, 
"Booking preference": 0.08,
"Location": 0.05, 
"Availability": 0.05, 
"Employment Type": 0.07

In [86]:
base_weights_5 = np.array([0.15, 0.15, 0.15, 0.10, 0.10, 0.10, 0.08, 0.05, 0.05, 0.07])
X_weighted_5 = set_weights(base_weights_5, preprocessor)
resample_cal(splits, n_total, X_weighted_5, y)


Training with 70% / 30% split:
Recall@3:    0.0015
Precision@3: 0.0005
F1-score@3:  0.0007
Accuracy@3:  0.0005

Training with 80% / 20% split:
Recall@3:    0.0000
Precision@3: 0.0000
F1-score@3:  0.0000
Accuracy@3:  0.0000

Training with 90% / 10% split:
Recall@3:    0.0022
Precision@3: 0.0007
F1-score@3:  0.0011
Accuracy@3:  0.0007


In [148]:
mul_X_weighted_5 = mul_set_weights(base_weights_5, preprocessor, X)
mul_resample_cal(splits, n_total=X.shape[0], feature_X=mul_X_weighted_5, label_Y=y, n_repeats=5)


Training with 70% / 30% split with repeat 5:
Accuracy@3:  0.8353
Precision@3: 0.8353
Recall@3:    0.9721
F1-score@3:  0.8985

Training with 80% / 20% split with repeat 5:
Accuracy@3:  0.8837
Precision@3: 0.8837
Recall@3:    0.9890
F1-score@3:  0.9334

Training with 90% / 10% split with repeat 5:
Accuracy@3:  0.9249
Precision@3: 0.9249
Recall@3:    0.9978
F1-score@3:  0.9600


Finally, it shows that the results of setting weights are similar to the results before.

## Conclusion

After the experiments above, we can vividly identify that the best result for KNN-CF model is to use the multiple resample method for training.

In [149]:
n_repeats = 5
mul_resample_cal(splits, n_total, X, y, n_repeats)


Training with 70% / 30% split with repeat 5:
Accuracy@3:  0.8353
Precision@3: 0.8353
Recall@3:    0.9721
F1-score@3:  0.8985

Training with 80% / 20% split with repeat 5:
Accuracy@3:  0.8837
Precision@3: 0.8837
Recall@3:    0.9890
F1-score@3:  0.9334

Training with 90% / 10% split with repeat 5:
Accuracy@3:  0.9249
Precision@3: 0.9249
Recall@3:    0.9978
F1-score@3:  0.9600
