In [3]:
from google.colab import files
import pandas as pd

# Ask the user to upload the data files through Colab's file picker
uploaded = files.upload()

# With the upload finished, parse both workbooks into DataFrames in one pass
train_df, test_df = (
    pd.read_excel(workbook) for workbook in ('train_data.xlsx', 'test_data.xlsx')
)


Saving test_data.xlsx to test_data.xlsx
Saving train_data.xlsx to train_data.xlsx


In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from google.colab import files

def _load_dataset(stem):
    """Load the dataset named `stem` from the current working directory.

    The CSV file `<stem>.csv` is tried first. If that file does not exist,
    the Excel workbook `<stem>.xlsx` is read instead, so the cell works with
    either upload format.

    Args:
        stem: File name without its extension, e.g. 'train_data'.

    Returns:
        The parsed pandas DataFrame.

    Raises:
        FileNotFoundError: If neither `<stem>.csv` nor `<stem>.xlsx` exists.
    """
    try:
        return pd.read_csv(f'{stem}.csv')
    except FileNotFoundError:
        # No CSV with this name was found; fall back to the Excel workbook.
        return pd.read_excel(f'{stem}.xlsx')


# Automatically upload files (only works in Google Colab)
uploaded = files.upload()

# Load the datasets after upload; the base file names must match the uploaded files
train_df = _load_dataset('train_data')
test_df = _load_dataset('test_data')

# Display the first few rows to check if the files are loaded properly
print("Training data preview:")
print(train_df.head())
print("\nTesting data preview:")
print(test_df.head())

def _clean_and_split(frame, label='target'):
    """Drop incomplete rows from `frame`, then separate the label column.

    Args:
        frame: DataFrame holding the feature columns and the label column.
        label: Name of the label column. Replace 'target' with your actual
            target column name.

    Returns:
        A (complete_rows, features, labels) tuple: the frame without rows that
        contain missing values, that frame minus the label column, and the
        label column itself.
    """
    complete_rows = frame.dropna()  # rows with missing values are removed (filling them is an alternative)
    return complete_rows, complete_rows.drop(columns=[label]), complete_rows[label]


# Training split: discard rows with missing values, then separate features (X) from target (y)
train_data, X_train, y_train = _clean_and_split(train_df)

# The test data goes through exactly the same preprocessing
test_data, X_test, y_test = _clean_and_split(test_df)

# Ask the user which model family to use (answer is trimmed and lower-cased)
model_type = input("Select model type ('logistic_regression' or 'random_forest'): ").strip().lower()

# Map each accepted answer to a builder; only the chosen estimator is constructed
_MODEL_BUILDERS = {
    'logistic_regression': lambda: LogisticRegression(max_iter=1000),
    'random_forest': lambda: RandomForestClassifier(random_state=42),
}

_build_model = _MODEL_BUILDERS.get(model_type)
if _build_model is None:
    # Any other answer is rejected before training starts
    raise ValueError("Invalid model type selected. Please choose either 'logistic_regression' or 'random_forest'.")
model = _build_model()

def _tune_model(estimator, param_grid, display_name):
    """Grid-search `estimator` over `param_grid` on the training split.

    Builds a 3-fold cross-validated GridSearchCV (verbose=2, n_jobs=-1), fits
    it on the notebook-level `X_train` / `y_train`, and prints the best
    parameter combination that was found.

    Args:
        estimator: Unfitted scikit-learn estimator to tune.
        param_grid: Dict mapping parameter names to the candidate values to try.
        display_name: Human-readable model name used in the progress messages.

    Returns:
        The fitted GridSearchCV object.
    """
    print(f"Performing hyperparameter tuning for {display_name}...")
    search = GridSearchCV(estimator=estimator, param_grid=param_grid, cv=3, verbose=2, n_jobs=-1)
    search.fit(X_train, y_train)
    print(f"Best parameters for {display_name}: {search.best_params_}")
    return search


# Hyperparameter tuning
if model_type == 'random_forest':
    param_grid_rf = {
        'n_estimators': [50, 100, 200],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }
    grid_search_rf = _tune_model(RandomForestClassifier(random_state=42), param_grid_rf, "Random Forest")
    best_rf_model = grid_search_rf.best_estimator_
    model = best_rf_model

elif model_type == 'logistic_regression':
    param_grid_lr = {
        'C': [0.1, 1, 10],
        'solver': ['liblinear', 'saga']
    }
    # Seeded like the Random Forest above: both candidate solvers shuffle the
    # data, so without a fixed random_state repeated runs can pick different models.
    grid_search_lr = _tune_model(LogisticRegression(max_iter=1000, random_state=42), param_grid_lr, "Logistic Regression")
    best_lr_model = grid_search_lr.best_estimator_
    model = best_lr_model

# Fit the chosen estimator on the training split
# NOTE(review): `model` is the grid search's best_estimator_, which is presumably
# already fitted (GridSearchCV refits by default) - this fit likely repeats that
# work; confirm before removing it.
print(f"Training the {model_type} model...")
model.fit(X_train, y_train)

# Predict labels for the held-out test split
print(f"Evaluating the {model_type} model...")
y_pred = model.predict(X_test)

# Report overall accuracy first, then the per-class breakdown
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print("Classification Report:")
report = classification_report(y_test, y_pred)
print(report)

# Save the trained model (only when the user answers exactly 'yes')
save_model = input("Do you want to save the trained model? (yes/no): ").strip().lower()
if save_model == 'yes':
    import joblib
    save_path = input("Enter the path to save the model (e.g., 'model.pkl'): ").strip()
    if not save_path:
        # A blank answer is not a usable file name; fall back to the example
        # path from the prompt instead of letting the dump fail.
        save_path = 'model.pkl'
    joblib.dump(model, save_path)
    print(f"Model saved at {save_path}")
else:
    print("Model not saved.")


Logistic Regression was selected as the best model because of its simplicity, interpretability, and strong performance metrics (accuracy, precision, recall, and F1-score). It also produced consistent results across the training and testing datasets, so I chose Logistic Regression as the final model.