In [19]:
# Import every library the notebook needs.
print("Started Importing")
import pandas as pd
from sklearn.model_selection import train_test_split  # NOTE(review): not used in the cells visible here
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
print("import finished")  # Progress prints are used throughout because some steps took minutes to produce results.
# Author's note: XGBoost was also tried but did not give better results; it actually started to reduce the score,
# possibly because very little time was spent working with it.

Started Importing
import finished


In [20]:
# Load the training features, the training labels and the test features from
# CSV files located in the current working directory.
print("Loading data...")
try:
    train_df = pd.read_csv("train.csv")
    train_labels_df = pd.read_csv("train_labels.csv")
    test_df = pd.read_csv("test.csv")
    print("Data loaded successfully.")

except FileNotFoundError as error_shown:
    # Report which file is missing, then stop without a traceback and with the
    # same (zero) exit status the original code produced.
    print(f"Error loading file: {error_shown}.Ensure the files are in the same directory.")
    # Raise SystemExit directly instead of calling exit(): exit() is a helper the
    # `site` module injects for interactive shells and is not meant for program code.
    # NOTE(review): inside a Jupyter kernel exit() may also shut the kernel down — confirm.
    raise SystemExit



Loading data...
Data loaded successfully.


In [21]:
# Build the labelled training table from the frames loaded earlier.

# If the feature table already carries a 'Class' column, remove it so that the
# only 'Class' column after the merge is the one supplied by train_labels_df.
if 'Class' in train_df.columns:
    train_df = train_df.drop(columns=['Class'])

# Attach the labels to the gene features through the shared 'Id' key.
# A left join keeps every feature row, labelled or not.
train_merged_df = train_df.merge(train_labels_df, on='Id', how='left')

# Row count before unlabelled samples are discarded.
initial_rows = len(train_merged_df)

# Rows whose 'Class' is NaN after the merge caused errors downstream, so they are removed.
train_merged_df = train_merged_df.dropna(subset=['Class'])
rows_after_drop = len(train_merged_df)

# Tell the user whether any rows were lost to missing labels.
if rows_after_drop < initial_rows:
    print(f"Warning: Dropped {initial_rows - rows_after_drop} rows from training data due to missing 'Class' labels.")
else:
    print("No missing 'Class' labels found in training data after merge.")
    
# Feature columns are all columns of the merged training table whose name
# starts with 'gene_'; the same list is used for both X_train and X_test.
columns_list = [name for name in train_merged_df.columns if name.startswith('gene_')]

# Split the training table into the feature matrix and the target vector.
X_train = train_merged_df[columns_list]
y_train = train_merged_df['Class']

if test_df.empty:
    print("Error: test.csv is empty. Cannot create X_test.")
    # Raise SystemExit directly instead of calling exit(): exit() is a helper the
    # `site` module injects for interactive shells and is not meant for program code.
    # The exit status stays the same (zero) as with the original exit() call.
    raise SystemExit

# Select the test features with the same columns, in the same order, as X_train.
X_test = test_df[columns_list]

# Report the shapes of the matrices that go into the model.
print(f"Training features shape: {X_train.shape}")
print(f"Training target shape: {y_train.shape}")
print(f"Test data shape: {X_test.shape}")


# Assemble the preprocessing + modelling pipeline.

# Random forest settings: 200 trees (the author started with 100 and reports
# better results with 200), balanced class weights to handle class imbalance,
# a fixed random seed, and n_jobs=-1.
forest = RandomForestClassifier(
    n_estimators=200,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)

pipeline_steps = [
    ('imputer', SimpleImputer(strategy='mean')),  # fill missing values with the mean
    ('scaler', StandardScaler()),                 # scale features to zero mean and unit variance
    ('classifier', forest),
]

pipeline = Pipeline(steps=pipeline_steps)



Training features shape: (150, 14572)
Training target shape: (150,)
Test data shape: (401, 14572)


In [22]:
# Fit every stage of the pipeline on the training features and labels.
print("Training model...")
pipeline.fit(X=X_train, y=y_train)
print("Model training complete.")



Training model...
Model training complete.


In [23]:
# Predict the class of every test sample and write the submission CSV.
print("Generating predictions on test data...")
predictions = pipeline.predict(X_test)
print("Predictions generated.")

print("Creating submission file...")

# Output path, defined in this cell so it runs on a fresh kernel. The original
# final print referenced this name without any visible cell defining it, and
# the saved output shows its value was 'submission.csv'.
submission_file_name = "submission.csv"

# One row per test sample: its 'Id' and the predicted 'Class'.
submission_df = pd.DataFrame({'Id': test_df['Id'], 'Class': predictions})

# Make sure the 'Class' column is stored with an integer dtype.
submission_df['Class'] = submission_df['Class'].astype(int)

# Save the submission file without the DataFrame index column.
submission_df.to_csv(submission_file_name, index=False)

print(f"Submission file '{submission_file_name}' created successfully.")

Generating predictions on test data...
Predictions generated.
Creating submission file...
Submission file 'submission.csv' created successfully.
