In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import xgboost as xgb
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset. 'MIC' and 'MIC_Interpretation' are forced to string dtype
# so their values are kept as text labels.
df = pd.read_csv(
    '/content/drive/MyDrive/microbis_data/amr_data_long_format_no_genes.csv',
    dtype={'MIC': 'str', 'MIC_Interpretation': 'str'},
)

# Working frame for MIC prediction: remove the 'MIC_Interpretation' column,
# then discard every row whose 'MIC' value is missing.
df_MIC = df.drop(columns=['MIC_Interpretation']).dropna(subset=['MIC'])

# Features (X) are all remaining columns except 'MIC'; the target (y) is 'MIC'.
X = df_MIC.drop(columns=['MIC'])
y = df_MIC['MIC']

# Feature columns that should be handled as categoricals. Names that are not
# present in X are silently skipped below.
categorical_columns = [
    'Phenotype', 'Species', 'Family', 'Country', 'State',
    'Gender', 'Age Group', 'Speciality', 'Source', 'In / Out Patient', 'Antibiotic',
]

# Cast every listed column that exists in X to pandas' 'category' dtype.
X = X.astype({name: 'category' for name in categorical_columns if name in X.columns})

# Columns whose dtype is already int64 or float64.
numeric_columns = X.select_dtypes(include=['int64', 'float64']).columns

# Run numeric coercion over those columns.
# NOTE(review): only columns that are already int64/float64 are selected above,
# so object-typed "numeric-like" columns are NOT converted by this loop.
for col in numeric_columns:
    X[col] = pd.to_numeric(X[col], errors='coerce')

# Number of rows carrying each target label.
class_counts = y.value_counts()

# Labels that occur exactly once in the data.
rare_classes = class_counts.loc[class_counts == 1].index

# Build the row mask once (True = row's label is not a singleton) and apply the
# same mask to both the features and the target so they stay aligned.
keep_rows = ~y.isin(rare_classes)
X = X.loc[keep_rows]
y = y.loc[keep_rows]

# LabelEncoder and train_test_split are already imported at the top of this
# cell, so they are not re-imported here.

# Encode the target variable: map each distinct MIC label to an integer id.
# The fitted encoder is kept in `label_encoder` so ids can be mapped back to
# the original labels with label_encoder.inverse_transform(...).
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data 70% train / 30% test. The split is seeded (random_state=42)
# and stratified on the encoded target, so both partitions keep the same class
# proportions. y_train / y_test are arrays of encoded ids, while X_train /
# X_test keep the row index labels of X.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.3,
    random_state=42,
    stratify=y_encoded,
)

In [None]:
import pandas as pd

# Wrap the training labels in a one-column DataFrame named 'MIC'.
# y_train is a plain array (no index), whereas X_train still carries the row
# labels it had before the shuffled split. pd.concat(axis=1) aligns rows by
# index label, so the label frame must be built on X_train's index; otherwise
# it would get a fresh 0..n-1 index, features would be paired with the wrong
# labels, and the result would be padded with NaN rows.
y_train_df = pd.DataFrame(y_train, columns=['MIC'], index=X_train.index)

# Combine X_train and y_train into a single DataFrame (rows matched by index).
train_set = pd.concat([X_train, y_train_df], axis=1)

# Sanity checks: one output row per training row, and every row has a label.
assert len(train_set) == len(X_train), "train_set row count differs from X_train"
assert train_set['MIC'].notna().all(), "train_set contains rows without a MIC label"

# NOTE(review): the 'MIC' column written here holds the integer ids produced by
# the label encoder, not the original MIC strings — confirm that is what
# consumers of this CSV expect.

# Save the combined DataFrame to a CSV file (the row index is not written).
train_set.to_csv('/content/drive/MyDrive/microbis_data/train_sets/MIC_train_set.csv', index=False)

print("Train set saved successfully as MIC_train_set.csv")

In [None]:
import pandas as pd
import json

# For every categorical feature that is present in X_train, collect its
# distinct non-missing values in sorted order. Values are converted to plain
# Python lists so that json can serialize them.
unique_values_dict = {
    col_name: sorted(X_train[col_name].dropna().unique().tolist())
    for col_name in categorical_columns
    if col_name in X_train.columns
}

# Write the {column -> sorted unique values} mapping as indented JSON.
unique_categories_path = '/content/drive/MyDrive/microbis_data/category_features_unique_values/mic_classification_best_unique_categories.json'
with open(unique_categories_path, 'w') as json_file:
    json.dump(unique_values_dict, json_file, indent=4)

print("Unique categories JSON file created successfully.")