##### The cell below is for you to keep track of the libraries used and install those libraries quickly
##### Ensure that the proper library names are used and the syntax of `%pip install PACKAGE_NAME` is followed

In [None]:
%pip install pandas 
%pip install matplotlib
%pip install imblearn
%pip install pyarrow
%pip install fastparquet
# add commented pip installation lines for packages used as shown above for ease of testing
# the line should be of the format %pip install PACKAGE_NAME 

## **DO NOT CHANGE** the filepath variable
##### Instead, create a folder named 'data' in your current working directory and 
##### have the .parquet file inside that. A relative path *must* be used when loading data into pandas

In [1]:
# Can have as many cells as you want for code
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier 
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
filepath = "./data/catB_train.parquet" 
# the initialised filepath MUST be a relative path to a folder named data that contains the parquet file

### **ALL** Code for machine learning and dataset analysis should be entered below. 
##### Ensure that your code is clear and readable.
##### Comments and Markdown notes are advised to direct attention to pieces of code you deem useful.

In [2]:
#pd.set_option('display.max_columns', None) 
test_df = pd.read_parquet("./data/catB_train.parquet")
#show no. of each numpy type
df = test_df
test_df.dtypes.value_counts()

object     214
int64       46
float64     44
dtype: int64

In [4]:
Y = test_df["f_purchase_lh"].fillna(0)
test_df = test_df.drop(columns=["f_purchase_lh"])
#remove columns without distinguishing capability
unique_counts = test_df.nunique()
columns_to_drop = unique_counts[unique_counts < 2].index
test_df = test_df.drop(columns=columns_to_drop)

unique_counts = unique_counts.sort_index()
unique_counts = test_df.nunique()
output_file_path = 'unique_counts.txt'
 
with open(output_file_path, 'w') as file:
    for column_name,no in unique_counts.iteritems():
        file.write(column_name + '\t' + str(no) + '\n')
    file.write('\n')


test_df = df_dropped
missing_data = test_df.isnull()
missing_count = missing_data.sum()
missing_count = missing_count.sort_index()
output_file_path = 'column_missing_output.txt'
 
with open(output_file_path, 'w') as file:
    for column_name,no in missing_count.iteritems():
        file.write(column_name + '\t' + str(no) + '\n')
    file.write('\n')   
float_columns = []
#identify columns with mostly numerical data, convert them to numeric
for column in test_df.columns:
    try:
        if test_df[column].str.contains(r'^\d*\.?\d+$').mean() > 0.5:
            float_columns.append(column)
    except AttributeError:
        pass
for column in float_columns:
    test_df[column] = pd.to_numeric(test_df[column], errors='coerce')

  for column_name,no in unique_counts.iteritems():


In [6]:
#get objects
categorical_columns = test_df.select_dtypes(include='object')
#categorical_columns
columns_to_encode = []
training_cats = {}
for column in categorical_columns:
    if test_df[column].nunique() < 50:
        columns_to_encode.append(column)
        test_df[column] = pd.Categorical(test_df[column]).codes
        training_cats[column] = pd.Categorical(test_df[column]).categories

In [None]:
# Generate unique counts and missing output for reference.

unique_counts = unique_counts.sort_index()
unique_counts = test_df.nunique()
output_file_path = 'unique_counts.txt'
 
with open(output_file_path, 'w') as file:
    for column_name,no in unique_counts.iteritems():
        file.write(column_name + '\t' + str(no) + '\n')
    file.write('\n')

missing_data = test_df.isnull()
missing_count = missing_data.sum()
missing_count = missing_count.sort_index()
output_file_path = 'column_missing_output.txt'
 
with open(output_file_path, 'w') as file:
    for column_name,no in missing_count.iteritems():
        file.write(column_name + '\t' + str(no) + '\n')
    file.write('\n')   

In [8]:
numerical_columns = test_df.select_dtypes(include='number')
numerical_columns = numerical_columns.fillna(numerical_columns.mean())
X = numerical_columns#.drop(columns=b)
X_train, X_val, y_train, y_val = train_test_split(X, Y, test_size=0.2, random_state=0)
X_train, y_train = SMOTE().fit_resample(X_train, y_train)

# KNN
# Finding most appropriate number of neighbours
neighbors = np.arange(1, 12)
train_accuracies = {}
test_accuracies = {}

for neighbor in neighbors:
	# Set up a KNN Classifier
	knn = KNeighborsClassifier(n_neighbors=neighbor)
  
	#Â Fit the model
	knn.fit(X_train, y_train)
  
	# Compute accuracy
	train_accuracies[neighbor] = knn.score(X_train, y_train)
	test_accuracies[neighbor] = knn.score(X_val, y_val)
print(neighbors, '\n', train_accuracies, '\n', test_accuracies)

[ 1  2  3  4  5  6  7  8  9 10 11] 
 {1: 0.9999637917300311, 2: 0.9945687595046708, 3: 0.9582518647259034, 4: 0.9600622782243464, 5: 0.9349699471359258, 6: 0.9376855673835904, 7: 0.9176986023607792, 8: 0.9209935549279455, 9: 0.9046998334419581, 10: 0.9081758273589687, 11: 0.8947063509305525} 
 {1: 0.8741317032509031, 2: 0.8910808557932759, 3: 0.8318977493748263, 4: 0.8516254515143096, 5: 0.8074465129202556, 6: 0.8266185051403168, 7: 0.7852181161433731, 8: 0.8007779938871908, 9: 0.7693803834398444, 10: 0.7827174215059739, 11: 0.7524312308974715}


## The cell below is **NOT** to be removed
##### The function is to be amended so that it accepts the given input (dataframe) and returns the required output (list). 
##### It is recommended to test the function out prior to submission
-------------------------------------------------------------------------------------------------------------------------------
##### The hidden_data parsed into the function below will have the same layout columns wise as the dataset *SENT* to you
##### Thus, ensure that steps taken to modify the initial dataset to fit into the model are also carried out in the function below

In [9]:
def testing_hidden_data(hidden_data: pd.DataFrame) -> list:
    '''DO NOT REMOVE THIS FUNCTION.

The function accepts a dataframe as input and return an iterable (list)
of binary classes as output.

The function should be coded to test on hidden data
and should include any preprocessing functions needed for your model to perform. 
    
All relevant code MUST be included in this function.'''
    hidden_data = hidden_data.drop(columns=columns_to_drop)
    for column in float_columns:
        hidden_data[column] = pd.to_numeric(hidden_data[column], errors='coerce')
    for column in columns_to_encode:
        hidden_data[column] = pd.Categorical(hidden_data[column],categories = training_cats[column]).codes
    numerical_columns = hidden_data.select_dtypes(include='number')
    numerical_columns.replace(9999, 0, inplace=True)
    numerical_columns = numerical_columns.fillna(numerical_columns.mean())
    result = knn.predict(numerical_columns)
    return result

##### Cell to check testing_hidden_data function

In [10]:
# This cell should output a list of predictions.
test_df = pd.read_parquet(filepath)
test_df = test_df.drop(columns=["f_purchase_lh"])
print(testing_hidden_data(test_df))

[1. 1. 0. ... 0. 0. 0.]


### Please have the filename renamed and ensure that it can be run with the requirements above being met. All the best!