In [1]:
import pandas as pd

# Load the dataset (without 'text' column, and with 'label' as the last column)
df_no_text = pd.read_csv('data/no_text_mhc_revision_5.csv')

# Drop the 4-gram / 5-gram summary columns
columns_to_drop = ['most_common_4-gram', 'num_most_common_4-gram', 'most_common_5-gram', 'num_most_common_5-gram']
df_no_text_dropped = df_no_text.drop(columns=columns_to_drop)

# Flag every row in which any cell, rendered as text, contains the substring
# "filler". The test runs one column at a time with vectorised string methods
# instead of building a string Series per row; regex=False makes the match a
# literal substring test and na=False guarantees a purely boolean mask.
has_filler = (
    df_no_text_dropped
    .astype(str)
    .apply(lambda column: column.str.contains('filler', regex=False, na=False))
    .any(axis=1)
)

# Keep only the rows that were not flagged
df_filtered = df_no_text_dropped[~has_filler]

# Save the resulting DataFrame to a new CSV file
df_filtered.to_csv('data/no_text_mhc_revision_5_2.csv', index=False)

# Confirm the changes
print(f"Saved the updated dataset with shape {df_filtered.shape} to 'data/no_text_mhc_revision_5_2.csv'.")

Saved the updated dataset with shape (27752, 27) to 'data/no_text_mhc_revision_5_2.csv'.


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score

# ---- Configuration ----
DATA_PATH = 'data/no_text_mhc_revision_5_2.csv'
RANDOM_STATE = 42       # seed shared by sampling, the train/test split and the MLP
MAX_PER_CLASS = 2000    # upper bound on the rows drawn from each label
MAX_TOTAL = 4000        # upper bound on the size of the working sample
TEST_SIZE = 0.2         # fraction of the working sample held out for testing


def print_evaluation(model_name, y_true, y_pred):
    """Print the accuracy and classification report for one model.

    Parameters
    ----------
    model_name : str
        Label used as the prefix of the printed lines.
    y_true : array-like
        Ground-truth labels of the evaluated samples.
    y_pred : array-like
        Labels predicted by the model for the same samples.

    Returns
    -------
    float
        The accuracy as returned by ``accuracy_score`` (a fraction, not a percentage).
    """
    accuracy = accuracy_score(y_true, y_pred)
    print(f"{model_name} Accuracy: {accuracy * 100:.2f}%")
    print(f"{model_name} Classification Report:")
    print(classification_report(y_true, y_pred))
    return accuracy


# Load the new dataset and replace missing values with 0
df_no_text = pd.read_csv(DATA_PATH).fillna(0)

# Convert categorical features to one-hot encoding (if any)
df_no_text_encoded = pd.get_dummies(df_no_text, drop_first=True)

# Limit the dataset to MAX_TOTAL samples with class balance:
# draw up to MAX_PER_CLASS rows from each label (assuming binary classification).
# Sampling each group explicitly avoids DataFrameGroupBy.apply, which recent
# pandas versions warn about when the applied function also receives the
# grouping column; the sampled rows keep their 'label' column here.
df_balanced = pd.concat(
    [
        label_rows.sample(n=min(len(label_rows), MAX_PER_CLASS), random_state=RANDOM_STATE)
        for _, label_rows in df_no_text_encoded.groupby('label')
    ]
)
# Shuffle the per-class samples together and cap the total row count
df_balanced = df_balanced.sample(n=min(MAX_TOTAL, len(df_balanced)), random_state=RANDOM_STATE)

# Separate features (X) and target (y)
X = df_balanced.drop(columns=['label'])  # Features
y = df_balanced['label']  # Target variable

# Split data into training and testing sets with stratification on the label
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
)

# Standardize the features; the scaler is fitted on the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ---- Logistic Regression Model ----
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train_scaled, y_train)

# Predictions and evaluation for Logistic Regression
y_pred_logreg = logreg.predict(X_test_scaled)
logreg_accuracy = print_evaluation("Logistic Regression", y_test, y_pred_logreg)

# ---- MLP Classifier Model ----
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=RANDOM_STATE)
mlp.fit(X_train_scaled, y_train)

# Predictions and evaluation for MLP Classifier
y_pred_mlp = mlp.predict(X_test_scaled)
mlp_accuracy = print_evaluation("MLP Classifier", y_test, y_pred_mlp)

# ---- Comparing Performance ----
print(f"Logistic Regression Accuracy: {logreg_accuracy * 100:.2f}%")
print(f"MLP Classifier Accuracy: {mlp_accuracy * 100:.2f}%")

  df_balanced = df_no_text_encoded.groupby('label', group_keys=False).apply(lambda x: x.sample(min(len(x), 2000), random_state=42))


Logistic Regression Accuracy: 75.38%
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.79      0.76       400
           1       0.78      0.71      0.74       400

    accuracy                           0.75       800
   macro avg       0.76      0.75      0.75       800
weighted avg       0.76      0.75      0.75       800

MLP Classifier Accuracy: 76.25%
MLP Classifier Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.80      0.77       400
           1       0.78      0.73      0.75       400

    accuracy                           0.76       800
   macro avg       0.76      0.76      0.76       800
weighted avg       0.76      0.76      0.76       800

Logistic Regression Accuracy: 75.38%
MLP Classifier Accuracy: 76.25%
