In [1]:
# Import the required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from scipy.stats.mstats import winsorize

In [2]:
def load_data(file_path, chunk_size=10000):
    """Read a CSV file chunk by chunk and return all rows as one DataFrame.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.
    chunk_size : int, default 10000
        Number of rows parsed per chunk.

    Returns
    -------
    pandas.DataFrame
        Every chunk concatenated in file order with a fresh 0..n-1 index.

    Notes
    -----
    Chunking only bounds the size of each individual parse step; all chunks
    are kept and concatenated, so the complete file is held in memory at the
    end.
    """
    chunk_reader = pd.read_csv(file_path, chunksize=chunk_size)
    pieces = [chunk for chunk in chunk_reader]
    return pd.concat(pieces, ignore_index=True)


In [3]:
def handle_binary_variables(df):
    """Cap values above 1 at 1 in every column whose name starts with 'b'.

    Values that are not greater than 1 (including missing values, for which
    the comparison is false) are left as they are. The frame is modified in
    place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose 'b'-prefixed columns are capped.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    flag_columns = list(filter(lambda name: name.startswith('b'), df.columns))

    for name in flag_columns:
        current = df[name]
        df[name] = np.where(current > 1, 1, current)

    return df


In [4]:
def handle_missing_values(df):
    """Impute missing values with a strategy chosen by column-name prefix.

    Column groups and strategies:

    * ``n*`` (numerical)   -> column mean
    * ``c*`` (categorical) -> most frequent value, except ``c9`` which is
      filled with the literal ``'Unknown'``
    * ``o*`` (ordinal)     -> most frequent value
    * ``b*`` (binary)      -> constant 0; the target column ``b17`` is left
      untouched

    Columns matching none of the prefixes (for example ``Id``) are not
    modified. A group with no columns is skipped. The frame is modified in
    place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with missing values filled.

    Notes
    -----
    The imputers are fitted on whatever frame is passed in, so calling this
    separately on two frames uses each frame's own statistics.
    """
    categorical_cols = [col for col in df.columns if col.startswith('c')]
    ordinal_cols = [col for col in df.columns if col.startswith('o')]
    numerical_cols = [col for col in df.columns if col.startswith('n')]
    binary_cols = [col for col in df.columns if col.startswith('b') and col != 'b17']

    # 'c9' gets an explicit 'Unknown' level instead of the modal value.
    # Assign the result back: a chained ``df['c9'].fillna(..., inplace=True)``
    # operates on a temporary under pandas Copy-on-Write and leaves df as-is.
    if 'c9' in categorical_cols:
        df['c9'] = df['c9'].fillna('Unknown')
        categorical_cols.remove('c9')

    imputation_plan = [
        (numerical_cols, {'strategy': 'mean'}),
        (categorical_cols, {'strategy': 'most_frequent'}),
        (ordinal_cols, {'strategy': 'most_frequent'}),
        (binary_cols, {'strategy': 'constant', 'fill_value': 0}),
    ]

    for cols, imputer_kwargs in imputation_plan:
        # An empty selection would make fit_transform raise; there is
        # nothing to impute for that group, so skip it.
        if not cols:
            continue
        df[cols] = SimpleImputer(**imputer_kwargs).fit_transform(df[cols])

    return df

In [5]:
def handle_outliers(df, columns, limits=(0.05, 0.05)):
    """Clip extreme values in the given columns using winsorization.

    For each column, the lowest and highest fractions of values given by
    ``limits`` are replaced with the nearest value that remains. The frame is
    modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to winsorize.
    columns : iterable of str
        Names of the columns to process. An empty iterable leaves ``df``
        unchanged.
    limits : tuple of float, default (0.05, 0.05)
        ``(lower, upper)`` fractions to clip at each end, forwarded to
        ``scipy.stats.mstats.winsorize``. The default clips the bottom 5% and
        the top 5% of each column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    for col in columns:
        df[col] = winsorize(df[col], limits=limits)
    return df


In [6]:
def normalize_numeric(df, columns):
    """Standardize the given columns with ``StandardScaler``.

    A new scaler is fitted on ``df[columns]`` and the scaled values are
    written back into the same columns. The frame is modified in place and
    also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to scale.
    columns : list of str
        Names of the numerical columns to scale. When empty, ``df`` is
        returned unchanged.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.

    Notes
    -----
    The scaler is fitted on whatever frame is passed in, so calling this
    separately on two frames scales each with its own mean and variance.
    """
    # Fitting a scaler on a zero-column selection raises; nothing to scale.
    if len(columns) == 0:
        return df

    scaler = StandardScaler()
    df[columns] = scaler.fit_transform(df[columns])
    return df


In [7]:

def encode_categorical(df, categorical_cols, dtype=np.float64):
    """One-hot encode ``categorical_cols`` and return the frame with them replaced.

    The listed columns are dropped and the encoded indicator columns are
    appended on the right. The first category of every column is omitted
    (``drop='first'``). A new encoder is fitted on every call.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the categorical columns.
    categorical_cols : list of str
        Names of the columns to encode. When empty, ``df`` is returned
        unchanged.
    dtype : numpy dtype, default numpy.float64
        dtype of the indicator columns, forwarded to ``OneHotEncoder``. A
        narrower type such as ``numpy.uint8`` reduces the memory used by the
        encoded block.

    Returns
    -------
    pandas.DataFrame
        A new frame: the non-categorical columns of ``df`` followed by the
        indicator columns, sharing ``df``'s index.

    Notes
    -----
    The encoded block is dense, so it needs roughly
    ``n_rows * n_indicator_columns * itemsize`` bytes. High-cardinality
    columns can make this enormous; the MemoryError recorded at the end of
    this notebook (shape (2351118, 76011), float64) is consistent with such
    an allocation.
    """
    # Encoding a zero-column selection raises; nothing to encode.
    if len(categorical_cols) == 0:
        return df

    # scikit-learn renamed ``sparse`` to ``sparse_output`` and later removed
    # the old keyword; try the new spelling first and fall back for older
    # releases.
    try:
        encoder = OneHotEncoder(drop='first', sparse_output=False, dtype=dtype)
    except TypeError:
        encoder = OneHotEncoder(drop='first', sparse=False, dtype=dtype)

    encoded = encoder.fit_transform(df[categorical_cols])

    # ``get_feature_names`` was replaced by ``get_feature_names_out``.
    if hasattr(encoder, 'get_feature_names_out'):
        encoded_names = encoder.get_feature_names_out(categorical_cols)
    else:
        encoded_names = encoder.get_feature_names(categorical_cols)

    # Reuse df's index so the column-wise concat lines rows up even when df
    # does not carry a default 0..n-1 index.
    df_encoded = pd.DataFrame(encoded, columns=encoded_names, index=df.index)
    remaining = df.drop(categorical_cols, axis=1)
    return pd.concat([remaining, df_encoded], axis=1)

In [8]:
def main():
    """Run the pipeline end to end: load, preprocess, train, predict, report.

    Reads 'assessment/train_df.csv' and 'assessment/pred_df.csv', applies the
    same preprocessing steps to both frames, trains a random forest on the
    training frame (target column 'b17'), writes the predictions for the
    prediction frame to 'submission.csv', and prints an F1 score measured on
    a held-out 20% slice of the training data.

    Notes
    -----
    Each preprocessing helper fits its own statistics on the frame it is
    given, so the prediction frame is imputed, winsorized and scaled with its
    own statistics rather than the training frame's.
    NOTE(review): confirm whether that is intended.
    """
    # Load data
    file_path_train = 'assessment/train_df.csv'
    file_path_pred = 'assessment/pred_df.csv'
    train_df = load_data(file_path_train)
    pred_df = load_data(file_path_pred)

    # Handle binary variables
    train_df = handle_binary_variables(train_df)
    pred_df = handle_binary_variables(pred_df)

    # Handle missing values
    train_df = handle_missing_values(train_df)
    pred_df = handle_missing_values(pred_df)

    # Handle outliers
    numerical_cols = [col for col in train_df.columns if col.startswith('n')]
    train_df = handle_outliers(train_df, numerical_cols)
    pred_df = handle_outliers(pred_df, numerical_cols)

    # Normalize numerical columns
    train_df = normalize_numeric(train_df, numerical_cols)
    pred_df = normalize_numeric(pred_df, numerical_cols)

    # Encode categorical columns
    categorical_cols = [col for col in train_df.columns if col.startswith('c')]
    train_df = encode_categorical(train_df, categorical_cols)
    pred_df = encode_categorical(pred_df, categorical_cols)

    # Split data into features and target
    features_train = train_df.drop(['Id', 'b17'], axis=1)
    target_train = train_df['b17']

    # The two frames are one-hot encoded independently, so their indicator
    # columns can differ in set and order. Align the prediction features to
    # the training layout: indicators seen only in the prediction data are
    # dropped, indicators missing from it are filled with 0.
    features_pred = pred_df.drop(['Id'], axis=1).reindex(
        columns=features_train.columns, fill_value=0
    )

    # Train the model on the full training data
    clf = RandomForestClassifier(random_state=42)
    clf.fit(features_train, target_train)

    # Make predictions
    y_pred = clf.predict(features_pred)

    # Prepare submission DataFrame
    submission = pd.DataFrame({'id': pred_df['Id'], 'b17': y_pred})

    # Save submission to CSV
    submission.to_csv('submission.csv', index=False)

    # Evaluate model performance on rows the evaluated model has not seen.
    # Scoring the training rows themselves mostly measures memorization, so
    # a separate model is fitted on 80% of the data and scored on the other
    # 20%. This costs one additional fit.
    features_fit, features_val, target_fit, target_val = train_test_split(
        features_train,
        target_train,
        test_size=0.2,
        random_state=42,
        stratify=target_train,
    )
    eval_clf = RandomForestClassifier(random_state=42)
    eval_clf.fit(features_fit, target_fit)
    y_val_pred = eval_clf.predict(features_val)
    print(f"F1-Score: {f1_score(target_val, y_val_pred):.4f}")

if __name__ == "__main__":
    main()

MemoryError: Unable to allocate 1.30 TiB for an array with shape (2351118, 76011) and data type float64