<a href="https://colab.research.google.com/github/swarubm/thyroid-MP/blob/main/1cell_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np


# Load the dataset.
# The CSV location depends on where the notebook runs, so it can be overridden
# with the THYROID_CSV environment variable; the default stays the Colab path.
import os

DATA_PATH = os.environ.get('THYROID_CSV', '/content/hypothyroid (3).csv')
if not os.path.isfile(DATA_PATH):
    # Fail early with an actionable message rather than a bare errno string.
    raise FileNotFoundError(
        f"Dataset not found at {DATA_PATH!r}; upload the CSV there or set "
        "the THYROID_CSV environment variable to its location."
    )
df = pd.read_csv(DATA_PATH)

print(df.shape)             # expected: 3772 rows, 30 columns
# A bare `df.head()` is only rendered when it is the last expression of a
# notebook cell, so print it explicitly.
print(df.head())
print(df['Class'].value_counts())  # Check class distribution

# Treat the '?' placeholder as missing, then remove TBG, referral_source and
# the *_measured indicator columns.
unused_columns = [
    'TBG_measured', 'TBG', 'referral_source', 'TSH_measured',
    'T3_measured', 'TT4_measured', 'T4U_measured', 'FTI_measured',
]
df = df.replace('?', np.nan).drop(columns=unused_columns)

# Map binary features 't'/'f' to 1/0 (any other value becomes NaN)
binary_cols = ['on_thyroxine','query_on_thyroxine','on_antithyroid_medication','sick',
               'pregnant','thyroid_surgery','I131_treatment','query_hypothyroid',
               'query_hyperthyroid','lithium','goitre','tumor','hypopituitary','psych']

for col in binary_cols:
    df[col] = df[col].map({'t':1,'f':0})

# Convert sex to numeric (M=1, F=0), imputing missing with mode.
# The filled column is assigned back rather than using fillna(inplace=True)
# on the column selection: that chained in-place call operates on an
# intermediate object and does not update `df` under pandas copy-on-write.
df['sex'] = df['sex'].map({'M':1,'F':0})
df['sex'] = df['sex'].fillna(df['sex'].mode()[0])

# Coerce the age/lab columns to numeric dtype; unparseable entries become NaN
num_cols = ['age', 'TSH', 'T3', 'TT4', 'T4U', 'FTI']
df = df.assign(
    **{name: pd.to_numeric(df[name], errors='coerce') for name in num_cols}
)

# Discard rows whose target label is missing
df.dropna(subset=['Class'], inplace=True)

# Binary target: 0 when Class is 'negative', 1 for every other label.
# pop() removes the raw label column while handing it back for encoding.
class_labels = df.pop('Class')
df['disease'] = class_labels.ne('negative').astype(int)
print("Shape after cleaning:", df.shape)
print(df.isnull().sum())  # Check remaining missing values


import seaborn as sns
import matplotlib.pyplot as plt

# Class imbalance
print(df['disease'].value_counts())

# Feature statistics (age and sex are left out of this summary)
print(df.drop(['age', 'sex'], axis=1).describe())


# Check correlation of numeric features.
# num_cols already contains TSH, T3, TT4 and T4U; selecting them a second time
# would only duplicate rows/columns in the correlation matrix.
corr = df[num_cols].corr()
print("Correlation matrix:\n", corr.round(2))


import seaborn as sns
import matplotlib.pyplot as plt

# Annotated heatmap of the correlation matrix held in `corr`
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Correlation Heatmap of Numeric Features")
plt.show()


from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Numeric age/lab features: mean-impute, then standardise.
numeric_features = ['age','TSH','T3','TT4','T4U']
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# sex (already encoded as 0/1 during cleaning): mode-impute, then one-hot.
categorical_features = ['sex']
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Binary 0/1 flags only need gap filling.  They must be listed explicitly:
# ColumnTransformer drops every column no transformer claims
# (remainder='drop' is the default), so without this entry the flags that are
# selected into X would never reach the classifiers.
binary_features = list(binary_cols)
binary_transformer = SimpleImputer(strategy='most_frequent')

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
    ('bin', binary_transformer, binary_features),
])


from sklearn.model_selection import train_test_split

# Assemble the feature matrix and target, then make a stratified 80/20 split
feature_columns = [*numeric_features, 'sex', *binary_cols]
X = df[feature_columns]
y = df['disease']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# Report the class shares of each split to confirm stratification worked
for split_label, split_target in (("Train", y_train), ("Test", y_test)):
    class_shares = split_target.value_counts(normalize=True).to_dict()
    print(f"{split_label} set class distribution:", class_shares)


from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Define pipelines: the shared preprocessor followed by one classifier each
pipe_lr = Pipeline([('prep', preprocessor), ('clf', LogisticRegression(max_iter=1000))])
pipe_rf = Pipeline([('prep', preprocessor), ('clf', RandomForestClassifier(n_estimators=100, random_state=42))])
# `use_label_encoder` is deprecated in XGBoost and only triggers a warning on
# current releases; the target is already integer 0/1, so it is omitted.
pipe_xgb = Pipeline([('prep', preprocessor), ('clf', XGBClassifier(eval_metric='logloss', random_state=42))])

# Train models
for name, pipe in [("Logistic Regression", pipe_lr),
                   ("Random Forest", pipe_rf),
                   ("XGBoost", pipe_xgb)]:
    pipe.fit(X_train, y_train)
    print(f"{name} trained.")


from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

# Existing pipelines (shared preprocessor + one classifier each)
pipe_lr = Pipeline([('prep', preprocessor),
                    ('clf', LogisticRegression(max_iter=1000, random_state=42))])

pipe_rf = Pipeline([('prep', preprocessor),
                    ('clf', RandomForestClassifier(n_estimators=100, random_state=42))])

# `use_label_encoder` is deprecated in XGBoost and only triggers a warning on
# current releases; the target is already integer 0/1, so it is omitted.
pipe_xgb = Pipeline([('prep', preprocessor),
                     ('clf', XGBClassifier(eval_metric='logloss',
                                           random_state=42))])

# Additional pipelines
# probability=True is required so the SVM exposes predict_proba for ROC-AUC
pipe_svm = Pipeline([('prep', preprocessor),
                     ('clf', SVC(probability=True, random_state=42))])

pipe_knn = Pipeline([('prep', preprocessor),
                     ('clf', KNeighborsClassifier(n_neighbors=5))])

pipe_dt = Pipeline([('prep', preprocessor),
                    ('clf', DecisionTreeClassifier(random_state=42))])

pipe_gb = Pipeline([('prep', preprocessor),
                    ('clf', GradientBoostingClassifier(random_state=42))])

pipe_nb = Pipeline([('prep', preprocessor),
                    ('clf', GaussianNB())])

# Collect all pipelines in a dictionary for easy iteration
pipelines = {
    'Logistic Regression': pipe_lr,
    'Random Forest': pipe_rf,
    'XGBoost': pipe_xgb,
    'SVM': pipe_svm,
    'KNN': pipe_knn,
    'Decision Tree': pipe_dt,
    'Gradient Boosting': pipe_gb,
    'Naive Bayes': pipe_nb
}


from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Registry of every pipeline to evaluate, keyed by the name used in the report
models = dict(
    LogisticRegression=pipe_lr,
    RandomForest=pipe_rf,
    XGBoost=pipe_xgb,
    SVM=pipe_svm,
    KNN=pipe_knn,
    DecisionTree=pipe_dt,
    GradientBoosting=pipe_gb,
    NaiveBayes=pipe_nb,
)

# Label/scorer pairs that are all computed from the hard predictions
label_scorers = (
    (" Accuracy:", accuracy_score),
    (" Precision:", precision_score),
    (" Recall:", recall_score),
    (" F1-score:", f1_score),
)

# Fit each pipeline on the training split and report test-set metrics
for name, model in models.items():
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    # Positive-class probabilities are only available on some estimators
    y_proba = None
    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X_test)[:, 1]

    print(f"\n{name} evaluation:")
    for label, scorer in label_scorers:
        print(label, scorer(y_test, y_pred))
    if y_proba is None:
        print(" ROC-AUC: Not available (model does not support probability estimates)")
    else:
        print(" ROC-AUC:", roc_auc_score(y_test, y_proba))
    print(" Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


import numpy as np
import matplotlib.pyplot as plt

# Random-forest feature importances, labelled with the names the fitted
# preprocessor actually emits.  A hand-written name list easily gets out of
# sync with the transformer output (column order, one one-hot column per sex
# category, columns the transformer drops), which mislabels the bars.
fitted_prep = pipe_rf.named_steps['prep']
ohe = fitted_prep.named_transformers_['cat'].named_steps['onehot']
feat_names = list(fitted_prep.get_feature_names_out())
importances = pipe_rf.named_steps['clf'].feature_importances_
indices = np.argsort(importances)[::-1]  # most important first

plt.figure(figsize=(6,4))
plt.title("Feature importances (Random Forest)")
plt.bar(range(len(importances)), importances[indices], align='center')
plt.xticks(range(len(importances)), np.array(feat_names)[indices], rotation=90)
plt.tight_layout()
plt.show()


import joblib
# Persist the chosen pipeline; XGBoost is assumed (not measured here) to be best
model_path = 'thyroid_detection_pipeline.joblib'
best_model = pipe_xgb
joblib.dump(best_model, model_path)
print("Pipeline saved to 'thyroid_detection_pipeline.joblib'.")


from flask import Flask, request, jsonify
import joblib
import pandas as pd

app = Flask(__name__)
# NOTE: joblib.load unpickles arbitrary objects; only load files produced by
# this project.
model = joblib.load('thyroid_detection_pipeline.joblib')  # load saved pipeline


@app.route('/predict', methods=['POST'])
def predict():
    """Predict the disease label for one record posted as a JSON object.

    The request body must be a JSON object mapping feature names to values.

    Returns:
        A JSON response ``{"prediction": <int>}`` on success, or
        ``{"error": <message>}`` with HTTP 400 when the body is not a
        non-empty JSON object or the pipeline rejects the supplied features.
    """
    # silent=True yields None for a missing/invalid JSON body instead of
    # raising, so every bad payload is reported the same way below.
    data = request.get_json(silent=True)
    if not isinstance(data, dict) or not data:
        return jsonify(
            {'error': 'Request body must be a non-empty JSON object of feature values.'}
        ), 400

    features = pd.DataFrame([data])
    try:
        prediction = model.predict(features)[0]
    except (KeyError, ValueError) as exc:
        # Missing columns or unparseable values are a client error, not a 500.
        return jsonify({'error': f'Invalid input: {exc}'}), 400
    return jsonify({'prediction': int(prediction)})


if __name__ == '__main__':
    app.run(port=5000)


# all in one
import pandas as pd
import numpy as np
# Load the dataset.
# The CSV location is machine-specific, so it can be overridden with the
# THYROID_CSV environment variable; the default stays the original local path.
import os

DATA_PATH = os.environ.get(
    'THYROID_CSV', r'C:\Users\swaro\Downloads\hypothyroid (3).csv'
)
if not os.path.isfile(DATA_PATH):
    # Fail early with an actionable message rather than a bare errno string.
    raise FileNotFoundError(
        f"Dataset not found at {DATA_PATH!r}; place the CSV there or set "
        "the THYROID_CSV environment variable to its location."
    )
df = pd.read_csv(DATA_PATH)
print(df.shape)             # expected: 3772 rows, 30 columns
print(df['Class'].value_counts())  # Check class distribution
# Treat the '?' placeholder as missing, then remove TBG, referral_source and
# the *_measured indicator columns.
unused_columns = [
    'TBG_measured', 'TBG', 'referral_source', 'TSH_measured',
    'T3_measured', 'TT4_measured', 'T4U_measured', 'FTI_measured',
]
df = df.replace('?', np.nan).drop(columns=unused_columns)

# Map binary features 't'/'f' to 1/0 (any other value becomes NaN)
binary_cols = ['on_thyroxine','query_on_thyroxine','on_antithyroid_medication','sick',
               'pregnant','thyroid_surgery','I131_treatment','query_hypothyroid',
               'query_hyperthyroid','lithium','goitre','tumor','hypopituitary','psych']
for col in binary_cols:
    df[col] = df[col].map({'t':1,'f':0})

# Convert sex to numeric (M=1, F=0), imputing missing with mode.
# The filled column is assigned back rather than using fillna(inplace=True)
# on the column selection: that chained in-place call operates on an
# intermediate object and does not update `df` under pandas copy-on-write.
df['sex'] = df['sex'].map({'M':1,'F':0})
df['sex'] = df['sex'].fillna(df['sex'].mode()[0])

# Coerce the age/lab columns to numeric dtype; unparseable entries become NaN
num_cols = ['age', 'TSH', 'T3', 'TT4', 'T4U', 'FTI']
df = df.assign(
    **{name: pd.to_numeric(df[name], errors='coerce') for name in num_cols}
)

# Discard rows whose target label is missing
df.dropna(subset=['Class'], inplace=True)

# Binary target: 0 when Class is 'negative', 1 for every other label.
# pop() removes the raw label column while handing it back for encoding.
class_labels = df.pop('Class')
df['disease'] = class_labels.ne('negative').astype(int)
print("Shape after cleaning:", df.shape)
print(df.isnull().sum())  # Check remaining missing values
import seaborn as sns
import matplotlib.pyplot as plt

# Class imbalance
print(df['disease'].value_counts())
# Feature statistics
print(df.describe())
# Check correlation of numeric features.
# num_cols already contains TSH, T3, TT4 and T4U; selecting them a second time
# would only duplicate rows/columns in the correlation matrix.
corr = df[num_cols].corr()
print("Correlation matrix:\n", corr.round(2))
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Numeric age/lab features: mean-impute, then standardise.
numeric_features = ['age','TSH','T3','TT4','T4U']
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# sex (already encoded as 0/1 during cleaning): mode-impute, then one-hot.
categorical_features = ['sex']
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Binary 0/1 flags only need gap filling.  They must be listed explicitly:
# ColumnTransformer drops every column no transformer claims
# (remainder='drop' is the default), so without this entry the flags that are
# selected into X would never reach the classifiers.
binary_features = list(binary_cols)
binary_transformer = SimpleImputer(strategy='most_frequent')

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
    ('bin', binary_transformer, binary_features),
])
from sklearn.model_selection import train_test_split

# Assemble the feature matrix and target, then make a stratified 80/20 split
feature_columns = [*numeric_features, 'sex', *binary_cols]
X = df[feature_columns]
y = df['disease']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# Report the class shares of each split to confirm stratification worked
for split_label, split_target in (("Train", y_train), ("Test", y_test)):
    class_shares = split_target.value_counts(normalize=True).to_dict()
    print(f"{split_label} set class distribution:", class_shares)
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

# Existing pipelines (shared preprocessor + one classifier each)
pipe_lr = Pipeline([('prep', preprocessor),
                    ('clf', LogisticRegression(max_iter=1000, random_state=42))])

pipe_rf = Pipeline([('prep', preprocessor),
                    ('clf', RandomForestClassifier(n_estimators=100, random_state=42))])

# `use_label_encoder` is deprecated in XGBoost and only triggers a warning on
# current releases; the target is already integer 0/1, so it is omitted.
pipe_xgb = Pipeline([('prep', preprocessor),
                     ('clf', XGBClassifier(eval_metric='logloss',
                                           random_state=42))])

# Additional pipelines
# probability=True is required so the SVM exposes predict_proba for ROC-AUC
pipe_svm = Pipeline([('prep', preprocessor),
                     ('clf', SVC(probability=True, random_state=42))])

pipe_knn = Pipeline([('prep', preprocessor),
                     ('clf', KNeighborsClassifier(n_neighbors=5))])

pipe_dt = Pipeline([('prep', preprocessor),
                    ('clf', DecisionTreeClassifier(random_state=42))])

pipe_gb = Pipeline([('prep', preprocessor),
                    ('clf', GradientBoostingClassifier(random_state=42))])

pipe_nb = Pipeline([('prep', preprocessor),
                    ('clf', GaussianNB())])

# Collect all pipelines in a dictionary for easy iteration
pipelines = {
    'Logistic Regression': pipe_lr,
    'Random Forest': pipe_rf,
    'XGBoost': pipe_xgb,
    'SVM': pipe_svm,
    'KNN': pipe_knn,
    'Decision Tree': pipe_dt,
    'Gradient Boosting': pipe_gb,
    'Naive Bayes': pipe_nb
}
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Registry of every pipeline to evaluate, keyed by the name used in the report
models = dict(
    LogisticRegression=pipe_lr,
    RandomForest=pipe_rf,
    XGBoost=pipe_xgb,
    SVM=pipe_svm,
    KNN=pipe_knn,
    DecisionTree=pipe_dt,
    GradientBoosting=pipe_gb,
    NaiveBayes=pipe_nb,
)

# Label/scorer pairs that are all computed from the hard predictions
label_scorers = (
    (" Accuracy:", accuracy_score),
    (" Precision:", precision_score),
    (" Recall:", recall_score),
    (" F1-score:", f1_score),
)

# Fit each pipeline on the training split and report test-set metrics
for name, model in models.items():
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    # Positive-class probabilities are only available on some estimators
    y_proba = None
    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X_test)[:, 1]

    print(f"\n{name} evaluation:")
    for label, scorer in label_scorers:
        print(label, scorer(y_test, y_pred))
    if y_proba is None:
        print(" ROC-AUC: Not available (model does not support probability estimates)")
    else:
        print(" ROC-AUC:", roc_auc_score(y_test, y_proba))
    print(" Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
import numpy as np
import matplotlib.pyplot as plt

# Random-forest feature importances, labelled with the names the fitted
# preprocessor actually emits.  A hand-written name list easily gets out of
# sync with the transformer output (column order, one one-hot column per sex
# category, columns the transformer drops), which mislabels the bars.
fitted_prep = pipe_rf.named_steps['prep']
ohe = fitted_prep.named_transformers_['cat'].named_steps['onehot']
feat_names = list(fitted_prep.get_feature_names_out())
importances = pipe_rf.named_steps['clf'].feature_importances_
indices = np.argsort(importances)[::-1]  # most important first

plt.figure(figsize=(6,4))
plt.title("Feature importances (Random Forest)")
plt.bar(range(len(importances)), importances[indices], align='center')
plt.xticks(range(len(importances)), np.array(feat_names)[indices], rotation=90)
plt.tight_layout()
plt.show()
import joblib
# Persist the chosen pipeline; XGBoost is assumed (not measured here) to be best
model_path = 'thyroid_detection_pipeline.joblib'
best_model = pipe_xgb
joblib.dump(best_model, model_path)
print("Pipeline saved to 'thyroid_detection_pipeline.joblib'.")



FileNotFoundError: [Errno 2] No such file or directory: '/content/hypothyroid (3).csv'