In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/sample_submission.csv
/kaggle/input/train (4).csv
/kaggle/input/train_labels.csv
/kaggle/input/test.csv


In [2]:
# Load the training table and report its dimensions.
# The feature count excludes the two non-feature columns (Id and Class).
train_df = pd.read_csv("/kaggle/input/train (4).csv")
n_rows, n_cols = train_df.shape
print(f"Training data shape: {(n_rows, n_cols)}")
print(f"Columns: Id + {n_cols-2} gene features + Class")

Training data shape: (400, 14574)
Columns: Id + 14572 gene features + Class


In [3]:
# Load the separate label table and report its dimensions.
labels_path = '/kaggle/input/train_labels.csv'
train_labels_df = pd.read_csv(labels_path)
print(f"Labeled training data shape: {train_labels_df.shape}")

Labeled training data shape: (150, 2)


In [4]:
# Count rows by whether the Class column is populated.
has_label = train_df['Class'].notna()
labeled_count = has_label.sum()
unlabeled_count = (~has_label).sum()
print(f"\nIn train.csv:")
print(f"Labeled samples: {labeled_count}")
print(f"Unlabeled samples: {unlabeled_count}")


In train.csv:
Labeled samples: 150
Unlabeled samples: 250


In [5]:
# Load the test table; its "Id" column is later dropped to build X_test and reused for the submission.
test_df = pd.read_csv("/kaggle/input/test.csv")

In [7]:
# Per-class sample counts in the label table, ordered by class id.
print(f"\nClass distribution in labeled data:")
class_counts = train_labels_df['Class'].value_counts().sort_index()
print(class_counts)


Class distribution in labeled data:
Class
0    26
1    26
2    56
3    15
4    27
Name: count, dtype: int64


In [9]:
# Left-join the label table onto the full training table by Id.
# NOTE(review): both frames carry a "Class" column, so pandas will presumably emit
# suffixed Class_x / Class_y columns here — confirm this is intended.
# NOTE(review): train1 is not referenced again in the visible part of the notebook.
train1 = train_df.merge(train_labels_df, on="Id", how="left")

In [13]:
# Separate model inputs from identifiers and targets.
non_feature_cols = ["Id", "Class"]
X = train_df.drop(columns=non_feature_cols)  # feature columns only
y = train_df["Class"]                        # target column; missing for unlabeled rows
X_test = test_df.drop(columns=["Id"])        # test features

In [14]:
from sklearn.preprocessing import StandardScaler

In [15]:
# Standardize features: fit the scaler on the training features,
# then apply the same transformation to the train and test tables.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
X_test_scaled = scaler.transform(X_test)

In [16]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

In [17]:
# NOTE(review): this cell repeats the standardization already performed two cells earlier
# (same inputs, same outputs); one of the two can presumably be removed.
scaler = StandardScaler()
scaler.fit(X)
X_scaled, X_test_scaled = scaler.transform(X), scaler.transform(X_test)

In [18]:
# Partition the scaled rows by label availability.
labeled_mask = y.notna()
X_labeled, X_unlabeled = X_scaled[labeled_mask], X_scaled[~labeled_mask]
y_labeled = y[labeled_mask].astype(int)  # labels of the labeled rows, cast to int


In [19]:
# Random forest wrapped in a self-training meta-estimator.
# With criterion='k_best', k_best is the number of pseudo-labeled samples added per iteration.
base_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
ssl_model = SelfTrainingClassifier(
    base_model,
    criterion='k_best',
    k_best=100,
)


In [20]:
# Hold out 20% of the labeled rows for validation, stratified by class.
split_parts = train_test_split(
    X_labeled,
    y_labeled,
    test_size=0.2,
    stratify=y_labeled,
    random_state=42,
)
X_train, X_val, y_train, y_val = split_parts

In [21]:
# Stack the labeled training rows on top of the unlabeled rows.
# Unlabeled rows get the target value -1.
unlabeled_targets = [-1 for _ in range(len(X_unlabeled))]
X_combined = np.concatenate((X_train, X_unlabeled))
y_combined = np.concatenate((y_train, unlabeled_targets))

In [22]:
from sklearn.impute import SimpleImputer

In [23]:
# Mean-impute the raw (unscaled) feature tables; the imputer is fitted on X only.
imputer = SimpleImputer(strategy='mean').fit(X)
X_imputed = imputer.transform(X)
X_test_imputed = imputer.transform(X_test)

In [24]:
# Re-standardize using the imputed tables.
# NOTE(review): X_labeled / X_unlabeled / X_train were sliced from the earlier X_scaled and are
# not rebuilt after this cell, so the arrays computed here do not appear to reach the model fit.
scaler = StandardScaler().fit(X_imputed)
X_scaled = scaler.transform(X_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

In [25]:
# Rebuild the combined training matrix: labeled training rows followed by the unlabeled rows,
# with -1 as the target for every unlabeled row.
n_unlabeled = len(X_unlabeled)
X_combined = np.concatenate((X_train, X_unlabeled))
y_combined = np.concatenate((y_train, [-1 for _ in range(n_unlabeled)]))


In [26]:
from sklearn.impute import SimpleImputer

# Fill missing entries of the combined matrix with each column's mean.
imputer = SimpleImputer(strategy='mean').fit(X_combined)
X_combined = imputer.transform(X_combined)


In [27]:
from sklearn.impute import SimpleImputer

# Mean-impute the combined matrix and keep the fitted imputer for later transforms.
# NOTE(review): X_combined was already imputed in the previous cell, so this pass
# presumably leaves the values unchanged.
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_combined)
X_combined = imputer.transform(X_combined)

# Train the self-training model on the labeled rows plus the rows marked -1.
ssl_model.fit(X_combined, y_combined)


In [28]:
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score

# Fill missing values in the held-out rows using the already-fitted imputer.
X_val_imputed = imputer.transform(X_val)

# Score the held-out labeled rows with macro-averaged F1.
y_val_pred = ssl_model.predict(X_val_imputed)
f1 = f1_score(y_true=y_val, y_pred=y_val_pred, average='macro')

print(f"📊 Macro F1 Score on validation set: {f1:.4f}")


📊 Macro F1 Score on validation set: 1.0000


In [29]:
# Bring the test features into the space the imputer (and the model) were fitted in.
# The training rows were standardized with a scaler fitted on X and only then mean-imputed,
# so the raw X_test values must be standardized the same way before imputation.
# StandardScaler.transform returns a NumPy array, matching the imputer's NumPy fit input.
test_scaler = StandardScaler().fit(X)
X_test_imputed = imputer.transform(test_scaler.transform(X_test))


In [30]:
# Refit the imputer on the combined training matrix. X_combined is a NumPy array of
# standardized features, so the imputer learns its column means in standardized space
# and carries no feature names.
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_combined)

# Standardize the test features exactly as the training rows were (scaler fitted on X),
# then impute. Passing the resulting NumPy array keeps the input consistent with how the
# imputer was fitted: no feature-name mismatch and no raw-vs-standardized mismatch.
test_scaler = StandardScaler().fit(X)
X_test_imputed = imputer.transform(test_scaler.transform(X_test))




In [31]:
# Get test IDs
test_ids = test_df["Id"]

# Preprocess the test features the same way the training rows were preprocessed.
# The model was fitted on rows that were first standardized (scaler fitted on X) and then
# mean-imputed, so feeding raw X_test values would apply the learned split thresholds to
# data on a different scale. The scaler is refitted on X here so this cell does not depend
# on which scaler object was assigned last; StandardScaler is deterministic, so it
# reproduces the statistics used for the training rows.
test_scaler = StandardScaler().fit(X)
X_test_imputed = imputer.transform(test_scaler.transform(X_test))

# Predict test labels
test_predictions = ssl_model.predict(X_test_imputed)

# Create submission DataFrame
submission = pd.DataFrame({
    'Id': test_ids,
    'Class': test_predictions
})





In [32]:
# Write the predictions to disk without the index column, then report what was written.
output_name = "submission.csv"
submission.to_csv(output_name, index=False)

print("✅ Submission file created: submission.csv")
print(f"Submission shape: {submission.shape}")


✅ Submission file created: submission.csv
Submission shape: (401, 2)
