In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import cohen_kappa_score, accuracy_score
import os

def load_tabular_data(train_path, test_path, dict_path):
    """
    Load the training, test, and dictionary data from the given paths.
    """
    X_train = pd.read_csv(train_path)
    X_test = pd.read_csv(test_path)
    data_dict = pd.read_csv(dict_path)
    
    return X_train, X_test, data_dict

def add_series_features(X, series_path):
    """
    Add features from the series data to the input data.
    """
    X[['enmo_mean', 'enmo_std', 'light_mean', 'light_std']] = np.nan

    for id in X['id'].values:
        if not os.path.exists(f'{series_path}/id={id}'):
            continue
        
        df_series = pd.read_parquet(f'{series_path}/id={id}', engine='pyarrow')
        X.loc[X['id'] == id, 'enmo_mean'] = df_series['enmo'].mean()
        X.loc[X['id'] == id, 'enmo_std'] = df_series['enmo'].std()
        X.loc[X['id'] == id, 'light_mean'] = df_series['light'].mean()
        X.loc[X['id'] == id, 'light_std'] = df_series['light'].std()
    
    return X    

def evaluate_model(model, X, y):
    """
    Evaluate the given model using quadratic weighted kappa and mean accuracy.
    """
    y_pred = model.predict(X)
    kappa = cohen_kappa_score(y, y_pred, weights='quadratic')
    mean_accuracy = accuracy_score(y, y_pred)
    
    return kappa, mean_accuracy

def get_labeled_subset(data):
    """
    Get all data points with labels from a given train set.
    """
    data = data.dropna(subset=["sii"])
    return data 

df_train, df_test, data_dict = load_tabular_data('baseline_train.csv','baseline_test.csv', 'data_dictionary.csv')
# Prepare train and test matrices
columns_not_in_test = list(set(df_train.columns).difference(set(df_test.columns)))
X_train = df_train.drop(columns_not_in_test + ['id'], axis=1)
y_train = df_train['sii']

print(X_train.shape), print(y_train.shape)

X_test = df_test.drop(columns_not_in_test + ['id'], axis=1)
y_test = df_test['sii']

print(X_test.shape)
print(y_test.shape)


(2188, 64)
(2188,)
(548, 64)
(548,)


In [8]:

# Identify categorical and numerical columns
numerical_features = data_dict[(data_dict['Type'] == 'float') | (data_dict['Type'] == 'int')]['Field'].values
numerical_features = [feature for feature in numerical_features if feature in X_train.columns]
categorical_features = data_dict[(data_dict['Type'] == 'str') | (data_dict['Type'] == 'categorical int')]['Field'].values
categorical_features = [feature for feature in categorical_features if feature in X_train.columns]

# Preprocess features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)



In [14]:
from sklearn.impute import SimpleImputer

# Create an imputer to fill NaN values with the mean
imputer = SimpleImputer(strategy="mean")

# Impute the missing values in X_train
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed= imputer.fit_transform(X_test)
# Now fit your LogisticRegression model with the imputed data


In [16]:
# Define Logistic Regression model with Ridge regularization
logistic_model = LogisticRegression(solver='saga', penalty='l2', C=1.0, max_iter=1000, random_state=42, multi_class='multinomial')

# Fit the model on training data
logistic_model.fit(X_train_imputed, y_train)

# Evaluate the model
kappa, accuracy = evaluate_model(logistic_model, X_test_imputed, y_test)
print(f'Logistic Regression model accuracy: {accuracy}, kappa: {kappa}')


Logistic Regression model accuracy: 0.5821167883211679, kappa: 0.0




In [None]:

# Make predictions on Kaggle test data
kaggle_test = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')
kaggle_test = add_series_features(kaggle_test, 'series_test.parquet')
id_column = kaggle_test['id']
kaggle_test = preprocessor.transform(kaggle_test.drop(columns_not_in_test + ['id'], axis=1))

predictions = logistic_model.predict(kaggle_test)
submission = pd.DataFrame({'id': id_column, 'sii': predictions})
submission.to_csv('/kaggle/working/submission.csv', index=False)
