# Train and Test the Training Data!
As we discovered in the exploration phase, every kid with an sii score has taken the questionnaire, and the score was derived from it. The rest of the training data is the kids without an sii, because they did not take the questionnaire (or rather, their parents did not).<br>
As such, we are left with a real training set of 2736 rows, which is not huge, so we'll use cross-validation.<br>
Instinct is to use a nearest-neighbours approach, so let's see how that goes.

In [2]:
#set up
import polars as pl
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score


# Polars table display limits: 50 rows, 300 characters wide, 90 columns.
for applySetting, limit in (
    (pl.Config.set_tbl_rows, 50),
    (pl.Config.set_tbl_width_chars, 300),
    (pl.Config.set_tbl_cols, 90),
):
    applySetting(limit)

In [130]:
# Load the competition training file (labelled and unlabelled rows together).
data = pl.read_csv('data/train.csv')

# Remove the questionnaire (PCIAT) columns from training: sii is derived from
# them, so they would leak the target.
dropCol = [col for col in data.columns if "PCIAT" in col]
data = data.drop(dropCol)

# The Season columns stay on `data` because the encoding experiment further
# down uses them, but they are excluded from `train` below.
seasonCol = [col for col in data.columns if "Season" in col]

# Labelled rows only. The Season columns are dropped so the feature set matches
# the one built for the test file, which also drops them. They are presumably
# text, which StandardScaler cannot scale.
# fill_null(-1) fills the numeric columns; fill_null("None") covers any
# remaining string columns.
train = (
    data
    .drop(seasonCol)
    .filter(pl.col('sii').is_not_null())
    .fill_null(-1)
    .fill_null("None")
)

## KNN: Optimal value of K

In [5]:
# Put together a standard pipeline: standardise the features, then a
# k-nearest-neighbours classifier with 11 neighbours.
clf = Pipeline([
    ("scaler", StandardScaler()),
    ("knn", KNeighborsClassifier(n_neighbors=11)),
])

In [7]:
# Hold out 20% of the labelled rows for scoring; the fixed seed keeps the
# split repeatable.
features = train.drop('id', 'sii')
target = train.select('sii').to_numpy().flatten()
Xtrain, Xtest, ytrain, ytest = train_test_split(
    features, target, test_size=0.2, random_state=42
)

# Candidate neighbour counts: 3 to 29 inclusive.
kValues = list(range(3, 30))

# Maps each k to its accuracy on the held-out split.
kValueDict = {}

# Fit and score one scaler + KNN pipeline per candidate k.
for k in kValues:
    clf = Pipeline([
        ("scaler", StandardScaler()),
        ("knn", KNeighborsClassifier(n_neighbors=k)),
    ])

    # fit() returns the fitted pipeline, so predictions can be chained.
    ypred = clf.fit(Xtrain, ytrain).predict(Xtest)

    accuracy = accuracy_score(ytest, ypred)
    kValueDict[k] = accuracy



We found that 4 was the optimal number of neighbours. So let's have a base test.

## Create Model & Test

In [10]:
# Train the baseline model with k=4, the neighbour count chosen above.
# NOTE(review): this fits on the Xtrain split only, not on every labelled row.
clf = Pipeline([
    ("scaler", StandardScaler()),
    ("knn", KNeighborsClassifier(n_neighbors=4)),
])
clf.fit(Xtrain, ytrain)

In [12]:
# Load the competition test file.
testData = pl.read_csv('data/test.csv')

# Remove the questionnaire (PCIAT) and Season columns, then fill the gaps:
# -1 for numeric nulls, "None" for any remaining string nulls.
dropCol = [
    col
    for col in testData.columns
    if any(tag in col for tag in ("PCIAT", "Season"))
]
prunedTest = testData.drop(dropCol)
test = prunedTest.fill_null(-1).fill_null("None")

In [14]:
# Predict sii for every test row; the id column is not a feature.
testFeatures = test.drop('id')
y_pred = clf.predict(testFeatures)

In [None]:
# Pair each test id with its predicted sii and save the result.
# The original line ended in a bare `.write`, which is not a DataFrame method,
# so nothing was ever written; write_csv is the polars CSV writer.
# NOTE(review): 'submission.csv' is an assumed output name. Confirm the path
# the competition expects.
submission = (
    testData
    .with_columns(pl.Series(y_pred).alias('sii'))
    .select('id', 'sii')
)
submission.write_csv('submission.csv')

# Leave the frame as the cell's last expression so it is still displayed.
submission

id,sii
str,i64
"""00008ff9""",0
"""000fd460""",0
"""00105258""",0
"""00115b9f""",0
"""0016bb22""",2
"""001f3379""",0
"""0038ba98""",0
"""0068a485""",0
"""0069fbed""",0
"""0083e397""",0


Despite our 0.61 accuracy in our own testing, we only got 0.282 on the real test submission, which suggests that the survey kids alone do not reflect the testing set well. Not overly surprising given how few observation points we have. So let's do a bit of unsupervised clustering and see where all our assigned training data ends up.

### Clustering: All Data
Clustering is going poorly, i.e. the data is not clusterable. At least not the test and non-test observations together. We got an alright test-only cluster.

In [163]:
from sklearn.cluster import DBSCAN, AffinityPropagation, MeanShift, AgglomerativeClustering, Birch, KMeans
from sklearn import metrics
from scipy.spatial.distance import cdist
import numpy as np

In [114]:
# Also want to get rid of a few more things in training. This extra pruning is
# currently disabled; it would have been:
#   dropCol = [x for x in train.columns if ('FGC-FGC' in x) & ('Zone' not in x)]
#   data.drop(dropCol).select('id', 'Basic_Demos-Age', 'Basic_Demos-Sex', 'CGAS-CGAS_Score',
#                             'SDS-SDS_Total_T', 'PreInt_EduHx-computerinternet_hoursday', 'sii')
# With it switched off, trainReduce is simply another name for the full `data` frame.
trainReduce = data

In [115]:
# Training data for clustering: every row of trainReduce (with or without an
# sii), minus the id and the target.
# The Season columns are removed too. They are presumably text (elsewhere in
# this notebook they are filled with "None" and ordinal-encoded), so
# fill_null(-1) would leave them untouched and StandardScaler could not scale them.
# If they were already dropped upstream, the list is empty and this is a no-op.
seasonal = [col for col in trainReduce.columns if "Season" in col]
trainingClust = (
    trainReduce
    .drop('id', 'sii', *seasonal)
    .fill_null(-1)
    .to_pandas()
)

In [125]:
# Standardise the features, then cluster them with Birch (threshold 0.1).
clf = Pipeline([
    ("scaler", StandardScaler()),
    ("cluster", Birch(threshold=0.1)),
])
labels = clf.fit_predict(trainingClust)

In [112]:
# Attach the cluster labels, then split into one frame per label.
withLabels = trainReduce.with_columns(pl.Series(labels).alias('Labels'))
partition = withLabels.partition_by('Labels')

In [126]:
# Per-cluster summary: number of rows with no sii, total rows, and the mode
# and mean of the sii values that are present. Smallest clusters come first.
siiCol = pl.col('sii')
knownSii = siiCol.filter(siiCol.is_not_null())

(
    trainReduce
    .with_columns(pl.Series(labels).alias('Labels'))
    .group_by('Labels')
    .agg(
        siiCol.filter(siiCol.is_null()).len().alias('NullSii'),
        siiCol.len().alias('TotalCount'),
        knownSii.mode().alias('Mode'),
        knownSii.mean().alias('Mean'),
    )
    .sort('TotalCount')
)

Labels,NullSii,TotalCount,Mode,Mean
i64,u32,u32,list[i64],f64
1,0,1,[0],0.0
2,823,1659,[0],0.625598
0,401,2300,[0],0.560821


### Let's look at some feature and other preprocessing methods

In [134]:
from sklearn.preprocessing import OrdinalEncoder

In [138]:
# Feature frame for the encoding experiment: everything except the id and the target.
trainData2 = data.drop(['id', 'sii'])

In [178]:
# Let's get the Season columns encoded: ordinal-encode (missing values become
# -1), standardise, then cluster into 80 groups with KMeans.
# NOTE(review): the encoder runs on every column of trainData2, not only the
# Season ones. Confirm that is intended.
# random_state is fixed (42, matching the train/test split above) so the
# cluster labels, and the summary table built from them, are reproducible.
clf = Pipeline(steps=[
    ("encoder", OrdinalEncoder(encoded_missing_value=-1)),
    ("scaler", StandardScaler()),
    ("cluster", KMeans(n_clusters=80, random_state=42)),
])
labels = clf.fit_predict(trainData2)

In [None]:
# Per-cluster summary: number of rows with no sii, total rows, and the mode
# and mean of the sii values that are present. Smallest clusters come first.
siiCol = pl.col('sii')
knownSii = siiCol.filter(siiCol.is_not_null())

(
    data
    .with_columns(pl.Series(labels).alias('Labels'))
    .group_by('Labels')
    .agg(
        siiCol.filter(siiCol.is_null()).len().alias('NullSii'),
        siiCol.len().alias('TotalCount'),
        knownSii.mode().alias('Mode'),
        knownSii.mean().alias('Mean'),
    )
    .sort('TotalCount')
)

Labels,NullSii,TotalCount,Mode,Mean
i32,u32,u32,list[i64],f64
38,0,13,[1],0.769231
74,9,13,[1],1.0
59,14,16,[0],0.0
48,0,18,[0],0.777778
75,4,18,[0],0.857143
56,20,23,[0],0.333333
79,0,25,[0],0.6
21,2,26,[1],1.041667
57,0,26,[0],0.653846
14,22,27,"[2, 1]",1.2


: 