In [2]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE

In [4]:
train=pd.read_csv('dataset/train_set.csv')
test=pd.read_csv('dataset/test_set.csv')
sub=pd.read_csv('dataset/blinded_test_set.csv')

In [6]:
train_=train.drop(columns=['ID','CLASS'])
test_=test.drop(columns=['ID','CLASS'])
sub_=sub.drop(columns=['ID'])

In [8]:
def get_similar_value_cols(df, percent=90):
    """
    Finds columns where one value constitutes more than 'percent' of the data.
    Excludes columns with 2 or fewer unique values (like binary or constant columns).

    :param df: input data in the form of a dataframe (pandas DataFrame).
    :param percent: integer value for the threshold (e.g., 90 for 90%).
    :return: sim_val_cols: list of column names meeting the criteria.
    """
    sim_val_cols = []
    for col in df.columns:
        # Calculate value counts and their percentages
        # value_counts(normalize=True) gives proportions, multiply by 100 for percentage
        value_counts = df[col].value_counts(normalize=True) * 100

        # Check if there are any values in the column
        if not value_counts.empty:
            # Get the percentage of the most frequent value
            most_frequent_percent = value_counts.iloc[0]

            # Check if the most frequent value's percentage is > percent AND
            # if there are more than 2 unique values (to exclude binary and single-value columns)
            # len(value_counts.index) gives the number of unique values
            if most_frequent_percent > percent: #and len(value_counts.index) > 2:
                sim_val_cols.append(col)

    # Print the total count after the loop finishes
    print(f"Total columns found with a single value sharing > {percent}% : {len(sim_val_cols)}")

    return sim_val_cols

similar_cols = get_similar_value_cols(train_)
similar_cols_test = get_similar_value_cols(test_)
similar_cols_sub = get_similar_value_cols(sub_)

Total columns found with a single value sharing > 90% : 154
Total columns found with a single value sharing > 90% : 154
Total columns found with a single value sharing > 90% : 149


In [10]:
train_=train_.drop(columns=similar_cols)
test_=test_.drop(columns=similar_cols)
sub_=sub_.drop(columns=similar_cols)

In [12]:
categorical_threshold = 20
def cat_num(df):
    unique_counts = df.nunique()
    # Identify categorical columns
    categorical_cols = unique_counts[unique_counts <= categorical_threshold].index.tolist()
    
    # Identify numerical/continuous columns
    # These are the columns where the unique count is GREATER than the threshold.
    numerical_cols = unique_counts[unique_counts > categorical_threshold].index.tolist()
    
    #len(categorical_cols),len(numerical_cols)
    df_num = df[numerical_cols]
    df_cat = df[categorical_cols]
    return df_num,df_cat

train_1_num,train_1_cat=cat_num(train_)
test_1_num,test_1_cat=cat_num(test_)
sub_1_num,sub_1_cat=cat_num(sub_)

In [14]:
def fill_missing_values(num_df, cat_df):
    """
    Fills missing values in numerical columns with mean and
    in categorical columns with mode.
    
    Parameters:
    - num_df: DataFrame containing numerical columns
    - cat_df: DataFrame containing categorical columns
    
    Returns:
    - cleaned_df: Combined DataFrame with missing values filled
    """
    # Fill numerical columns
    for col in num_df.columns:
        if num_df[col].isna().any():
            num_df[col] = num_df[col].fillna(num_df[col].mean())
    
    # Fill categorical columns
    for col in cat_df.columns:
        if cat_df[col].isna().any():
            cat_df[col] = cat_df[col].fillna(cat_df[col].mode()[0])
    
    # Combine back numerical and categorical DataFrames
    cleaned_df = pd.concat([num_df, cat_df], axis=1)
    
    return cleaned_df

# For training set
train_1_cleaned = fill_missing_values(train_1_num, train_1_cat)

# For test set
test_1_cleaned = fill_missing_values(test_1_num, test_1_cat)

sub_1_cleaned = fill_missing_values(sub_1_num, sub_1_cat)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  num_df[col] = num_df[col].fillna(num_df[col].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cat_df[col] = cat_df[col].fillna(cat_df[col].mode()[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  num_df[col] = num_df[col].fillna(num_df[col].mean())
A value is trying to be set on a copy of a 

In [16]:
def fill_inf(df):
# Example: Create a boolean DataFrame where True = inf or -inf
    inf_mask = np.isinf(df)
    inf_positions = [(row_idx, col) for row_idx, row in inf_mask.iterrows() for col in df.columns if row[col]]
    #print(inf_positions)
    
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.fillna(df.mean(), inplace=True)
    return df    

In [18]:
train_1_cleaned_inf=fill_inf(train_1_cleaned)
test_1_cleaned_inf=fill_inf(test_1_cleaned)
sub_1_cleaned_inf=fill_inf(sub_1_cleaned)

In [26]:
def cap_outliers(df):
    df_capped = df.copy()  # Avoid modifying original data
    
    # Iterate over numeric columns
    for col in df_capped:
        Q1 = df_capped[col].quantile(0.25)
        Q3 = df_capped[col].quantile(0.75)
        IQR = Q3 - Q1

        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # Cap the values
        df_capped[col] = np.where(df_capped[col] < lower_bound, lower_bound, df_capped[col])
        df_capped[col] = np.where(df_capped[col] > upper_bound, upper_bound, df_capped[col])
    
    return df_capped

train_1_num,train_1_cat_n=cat_num(train_1_cleaned_inf)
test_1_num,test_1_cat_n=cat_num(test_1_cleaned_inf)
# Apply to your DataFrame
train_1_num_capped = cap_outliers(train_1_num)
test_1_num_capped = cap_outliers(test_1_num)

sub_1_num,sub_1_cat_n=cat_num(sub_1_cleaned_inf)
# Apply to your DataFrame
sub_1_num_capped = cap_outliers(sub_1_num)


# Check capped data
# print(train_1_capped.describe())


# If you want to recombine:
train_1_capped = pd.concat([train_1_num_capped, train_1_cat_n], axis=1)
test_1_capped = pd.concat([test_1_num_capped, test_1_cat_n], axis=1)

sub_1_capped = pd.concat([sub_1_num_capped, sub_1_cat_n], axis=1)
# Check results
#print(train_1_capped.head())


In [28]:
def rem_high_corr(df):
    corr_matrix = df.corr().abs()
    upper = corr_matrix.where(
        np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
    #df.drop(columns=to_drop, inplace=True)
    return to_drop
columns_drop=rem_high_corr(train_1_capped)
train_exp=train_1_capped.drop(columns=columns_drop)
test_exp=test_1_capped.drop(columns=columns_drop)
sub_exp=sub_1_capped.drop(columns=columns_drop)

In [34]:
sub_exp

Unnamed: 0,Feature_1,Feature_4,Feature_5,Feature_10,Feature_11,Feature_13,Feature_14,Feature_15,Feature_16,Feature_17,...,Feature_2737,Feature_2775,Feature_2776,Feature_2786,Feature_2789,Feature_2850,Feature_2853,Feature_2862,Feature_2865,Feature_2881
0,13249.25,0.40169,0.018668,7.621183,95.817535,17.527215,14.711833,0.169403,0.142192,0.11061,...,1.0,0.957977,1.282051,0.901381,0.122288,1.0,0.0,1.0,1.0,1.0
1,60593.666667,0.351976,0.010976,38.462982,270.475507,69.042008,33.614694,0.211285,0.102869,0.052173,...,5.0,1.0,1.0,1.0,0.0,1.0,0.0,3.8,5.0,3.0
2,51978.833333,0.376583,0.010708,25.820516,243.400904,41.875408,29.362187,0.152257,0.10676,0.076363,...,7.0,1.0,1.0,1.0,0.0,1.0,0.0,1.333333,9.0,1.0
3,47737.416667,0.361293,0.011891,62.531559,277.600432,60.896711,22.965048,0.187515,0.070715,0.060928,...,4.0,1.0,1.0,1.0,1.2325950000000001e-32,1.0,0.0,3.833333,6.0,4.0
4,33029.458333,0.481423,0.009294,11.310782,246.440662,54.026198,19.434182,0.185228,0.06663,0.044668,...,7.0,1.0,1.0,1.0,1.2325950000000001e-32,1.0,0.0,2.0,3.0,1.0
5,11367.875,0.490432,0.015609,31.255069,108.850868,18.340511,12.408828,0.171751,0.115287,0.096185,...,3.5,1.0,1.0,1.0,0.0,1.0,0.0,3.583333,2.5,1.5
6,38157.416667,0.42159,0.010551,43.287033,226.603177,34.522569,24.103503,0.130162,0.090878,0.103534,...,4.0,1.0,1.0,1.0,0.0,1.0,0.0,3.75,4.0,2.0
7,11407.166667,0.476839,0.016178,30.080752,125.907471,18.807297,11.758521,0.150351,0.092024,0.096998,...,3.5,1.0,1.0,1.0,6.162976000000001e-33,1.0,0.0,4.666667,3.0,2.0
8,30688.166667,0.424989,0.011624,2.720334,202.29434,33.334118,17.332148,0.137276,0.071377,0.111852,...,5.0,1.0,1.0,1.0,0.0,1.0,0.0,3.8,5.0,3.0
9,41047.041667,0.414955,0.010418,48.224389,248.714696,30.943447,28.513152,0.10206,0.094044,0.082065,...,5.0,1.0,1.0,1.0,0.0,1.0,0.0,3.8,5.0,3.0


In [50]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
common_columns = train_exp.columns.intersection(test_exp.columns)
#common_columns = list(common_columns)

# Remove the column
# if 'Feature_100' in common_columns:
#     common_columns.remove('Feature_100')
X_alig = train_exp[common_columns]
X_test_alig = test_exp[common_columns]
X_sub_alig = sub_exp[common_columns]


scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_alig)
X_test_s = scaler.transform(X_test_alig)
X_sub_s = scaler.transform(X_sub_alig)


pca = PCA(n_components=50)  # retain 95% variance
X_train_sel = pca.fit_transform(X_train_s)
X_test_sel=pca.transform(X_test_s) 
X_sub_sel=pca.transform(X_sub_s) 
#X_org_sel=pca.transform(X_org_s) 
print(f"Reduced features: {X_train_sel.shape[1]}")

Reduced features: 50


In [70]:
import joblib
rf = joblib.load('models/random_forest.pkl')
stacking_clf = joblib.load('models/stacking_clf.pkl')
xgb = joblib.load('models/xgboost.pkl')


In [83]:
sub

Unnamed: 0,ID,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,...,Feature_3229,Feature_3230,Feature_3231,Feature_3232,Feature_3233,Feature_3234,Feature_3235,Feature_3236,Feature_3237,Feature_3238
0,ID_101,13249.25,13323.0,5322.087891,0.40169,0.019253,0.131701,1.965488,0.50878,0.965488,...,453.349939,453.349939,1646.0,1.0,162.029162,0.098438,0.0,30.580378,3.888605,0.098438
1,ID_102,60593.666667,60804.0,21327.521484,0.351976,0.010976,0.042804,2.858719,0.349807,1.858719,...,492.250478,492.250478,7853.0,1.0,961.759455,0.12247,0.0,26.690038,3.695084,0.12247
2,ID_103,51978.833333,52193.0,19574.339844,0.376583,0.010708,0.040742,2.906154,0.344097,1.906154,...,482.387417,482.387417,6644.0,1.0,763.046057,0.114847,0.0,30.037774,3.804517,0.114847
3,ID_104,47737.416667,47943.0,17247.173828,0.361293,0.011891,0.050236,2.710158,0.368982,1.710158,...,475.620243,475.620243,6017.0,1.0,718.741732,0.119452,0.0,27.964103,3.69986,0.119452
4,ID_105,33029.458333,33261.0,15901.136719,0.481423,0.009294,0.030688,3.19406,0.313081,2.19406,...,417.949466,417.949466,4116.0,1.0,314.568513,0.076426,0.0,31.80214,4.078748,0.076426
5,ID_106,11367.875,11452.0,5498.018555,0.490432,0.015609,0.086609,2.261276,0.442334,1.261276,...,410.847421,410.847421,1436.5,1.0,94.88406,0.064771,0.0,29.473132,4.164385,0.064771
6,ID_107,38157.416667,38349.0,16086.787109,0.42159,0.010551,0.039555,2.93494,0.340722,1.93494,...,443.259708,443.259708,4790.0,1.0,417.063466,0.08707,0.0,27.729782,3.924945,0.08707
7,ID_108,11407.166667,11494.0,5457.56665,0.476839,0.016178,0.094018,2.220773,0.452508,1.220773,...,409.418166,409.418166,1432.0,1.0,99.172963,0.069607,0.0,27.6498,4.068913,0.069607
8,ID_109,30688.166667,30858.0,13042.123047,0.424989,0.011624,0.048012,2.751378,0.363454,1.751378,...,452.841584,452.841584,3737.0,1.0,343.150388,0.091825,0.0,29.904804,3.929448,0.091825
9,ID_110,41047.041667,41269.0,17032.671875,0.414955,0.010418,0.038563,2.959903,0.337849,1.959903,...,460.322568,460.322568,5202.0,1.0,526.119569,0.101138,0.0,31.668608,3.910294,0.101138


In [85]:
len(probs_sub_s)

100

In [91]:
# Predict probabilities
# (assuming binary classification: columns for class 0 and 1)

def prediction(train,test,sub,model):
    probs_train = model.predict_proba(train)
    probs_test = model.predict_proba(test)
    probs_sub = model.predict_proba(sub)
    return probs_train,probs_test,probs_sub
    
    
probs_train_rf,probs_test_rf,probs_sub_rf = prediction(X_train_sel,X_test_sel,X_sub_sel,rf)
probs_train_s,probs_test_s,probs_sub_s = prediction(X_train_s,X_test_s,X_sub_s,stacking_clf)
probs_train_x,probs_test_x,probs_sub_x = prediction(X_train_s,X_test_s,X_sub_s,xgb)




    # Create a DataFrame for submission
def dataframe_create(probs_train,probs_test,probs_sub):
    df_submission_train = pd.DataFrame(probs_train, columns=['prob_class_0', 'prob_class_1'])
    df_submission_test = pd.DataFrame(probs_test, columns=['prob_class_0', 'prob_class_1'])
    df_submission_sub = pd.DataFrame(probs_sub, columns=['prob_class_0', 'prob_class_1'])
    
    # Add ID column
    
    df_submission_train['ID'] = train['ID'].values
    
    df_submission_test['ID'] = test['ID'].values
    df_submission_sub['ID'] = sub['ID'].values
    
    # Move 'ID' to first column
    df_submission_train = df_submission_train[['ID', 'prob_class_0', 'prob_class_1']]
    df_submission_test = df_submission_test[['ID', 'prob_class_0', 'prob_class_1']]
    df_submission_sub = df_submission_sub[['ID', 'prob_class_0', 'prob_class_1']]

    return df_submission_train,df_submission_test,df_submission_sub

df_submission_train_rf,df_submission_test_rf,df_submission_sub_rf=dataframe_create(probs_train_rf,probs_test_rf,probs_sub_rf)
df_submission_train_s,df_submission_test_s,df_submission_sub_s=dataframe_create(probs_train_s,probs_test_s,probs_sub_s)
df_submission_train_x,df_submission_test_x,df_submission_sub_x=dataframe_create(probs_train_x,probs_test_x,probs_sub_x)

# Save to CSV
def save(df_submission_train,df_submission_test,df_submission_sub,ext):
    df_submission_train.to_csv(f'{ext}train_pred.csv', index=False)
    df_submission_test.to_csv(f'{ext}test_pred.csv', index=False)
    df_submission_sub.to_csv(f'{ext}submission_pred.csv', index=False)
    print("Submission file created")

save(df_submission_train_rf,df_submission_test_rf,df_submission_sub_rf,'rf')
save(df_submission_train_s,df_submission_test_s,df_submission_sub_s,'stack')
save(df_submission_train_x,df_submission_test_x,df_submission_sub_x,'xg')

Submission file created
Submission file created
Submission file created
