In [36]:
import numpy as np
import pandas as pd

In [102]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from imblearn.combine import SMOTEENN
from imblearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from xgboost import XGBClassifier
from sklearn.svm import SVC
import joblib

In [38]:
file_path_with_infertility="./PCOS_infertility.csv"
file_path_without_infertility="./PCOS_data_without_infertility.xlsx"

PCOS_inf = pd.read_csv(file_path_with_infertility)
PCOS_woinf = pd.read_excel(file_path_without_infertility, sheet_name="Full_new")

In [39]:
data = pd.merge(PCOS_woinf,PCOS_inf, on='Patient File No.', suffixes=('','_y'),how='left')

data =data.drop(['Unnamed: 44', 'Sl. No', 'Sl. No_y', 'PCOS (Y/N)_y', '  I   beta-HCG(mIU/mL)_y',
       'II    beta-HCG(mIU/mL)_y', 'AMH(ng/mL)_y', 'Patient File No.'], axis=1)

In [40]:
data["AMH(ng/mL)"] = pd.to_numeric(data["AMH(ng/mL)"], errors='coerce')
data["II    beta-HCG(mIU/mL)"] = pd.to_numeric(data["II    beta-HCG(mIU/mL)"], errors='coerce')

data.columns = [col.strip() for col in data.columns]

In [42]:
data = data[(data["BP _Diastolic (mmHg)"]>20)]
data = data[(data["AMH(ng/mL)"]<40)]
data = data[(data["BP _Systolic (mmHg)"]>20)]
data = data[(data["Endometrium (mm)"]>0)]
data = data[(data["Avg. F size (R) (mm)"]>0)]
data = data[(data["Avg. F size (R) (mm)"]>0)]
data = data[(data["RBS(mg/dl)"]<200)]
data = data[(data["PRG(ng/mL)"]<20)]
data = data[(data["Pulse rate(bpm)"]>20)]
data = data[(data["FSH(mIU/mL)"]<4000)]
data = data[(data["LH(mIU/mL)"]<1500)]
data = data[(data["Cycle(R/I)"]<4.5)]

In [43]:
X=data.drop(["PCOS (Y/N)"],axis = 1) 
y=data["PCOS (Y/N)"]

X_train,X_test, y_train, y_test = train_test_split(X,y, test_size=0.3) 

In [None]:
X_train

In [95]:
pipeline = Pipeline([ 
                     ('pca', PCA()),
                     ('svm', SVC())
                     ])
param_grid = {
    'pca__n_components':[i for i in range(1, len(X_train.columns))],
    'svc__C':[0.1, 1, 10, 100, 1000],
    'svc_kernel':['linear', 'rbf', 'poly'],
    'svc__gamma':['scale', 'auto'],
}
stack_gridcv = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid, cv= 5, n_jobs=-1
    )

In [96]:
stack_gridcv.fit(X_train, y_train)

In [97]:
stack_gridcv.best_params_

{'pca__n_components': 32, 'xgb__n_estimators': 200}

In [98]:
stack_gridcv.predict(X_test)

array([1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0])

In [99]:
stack_gridcv.score(X_test, y_test)

0.8734177215189873

In [65]:
y_test.value_counts()

0    107
1     51
Name: PCOS (Y/N), dtype: int64

In [101]:
joblib.dump(stack_gridcv, 'model.pkl')

['model.pkl']