In [1]:
import pandas as pd
df = pd.read_csv("./PCOS_data_without_infertility.csv")

## 把需要抽血的指數移除

In [2]:
new_df = df.drop(['Hb(g/dl)', 
                  'I   beta-HCG(mIU/mL)',
                  'II    beta-HCG(mIU/mL)',
                  'FSH(mIU/mL)', 'LH(mIU/mL)', 
                  'FSH/LH', 
                  'TSH (mIU/L)', 
                  'AMH(ng/mL)', 
                  'PRL(ng/mL)', 
                  'Vit D3 (ng/mL)', 
                  'RBS(mg/dl)', 
                  'Follicle No. (L)', 
                  'Follicle No. (R)', 
                  'Avg. F size (L) (mm)', 
                  'Avg. F size (R) (mm)', 
                  'Endometrium (mm)'], axis = 1)

In [3]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 29 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Sl. No                 541 non-null    float64
 1   Patient File No.       541 non-null    float64
 2   PCOS (Y/N)             541 non-null    float64
 3    Age (yrs)             541 non-null    float64
 4   Weight (Kg)            541 non-null    float64
 5   Height(Cm)             541 non-null    float64
 6   BMI                    541 non-null    object 
 7   Blood Group            541 non-null    float64
 8   *Pulse rate(bpm)       541 non-null    float64
 9   RR (breaths/min)       541 non-null    float64
 10  Cycle(R/I)             541 non-null    float64
 11  Cycle length(days)     541 non-null    float64
 12  Marraige Status (Yrs)  540 non-null    float64
 13  Pregnant(Y/N)          541 non-null    float64
 14  No. of aborptions      541 non-null    float64
 15  Hip(in

## 把不需要的欄位移除

In [4]:
new_df = new_df.drop(['BMI', 'Waist:Hip Ratio', 'Unnamed: 44', 'Sl. No', 'Patient File No.'], axis = 1)

## 移除空值

In [5]:
new_df = new_df.dropna()

In [6]:
new_df.isnull().sum()

PCOS (Y/N)               0
 Age (yrs)               0
Weight (Kg)              0
Height(Cm)               0
Blood Group              0
*Pulse rate(bpm)         0
RR (breaths/min)         0
Cycle(R/I)               0
Cycle length(days)       0
Marraige Status (Yrs)    0
Pregnant(Y/N)            0
No. of aborptions        0
Hip(inch)                0
Waist(inch)              0
PRG(ng/mL)               0
Weight gain(Y/N)         0
hair growth(Y/N)         0
Skin darkening (Y/N)     0
Hair loss(Y/N)           0
Pimples(Y/N)             0
Fast food (Y/N)          0
Reg.Exercise(Y/N)        0
BP _Systolic (mmHg)      0
BP _Diastolic (mmHg)     0
dtype: int64

In [7]:
x = new_df.drop(['PCOS (Y/N)'],axis=1)
y = new_df['PCOS (Y/N)']

In [8]:
x['BMI'] = x['Weight (Kg)']/((x['Height(Cm) ']/100)**2)
x['Waist:Hip Ratio'] = x['Waist(inch)']/x['Hip(inch)']
x = x.drop(['Weight (Kg)', 'Height(Cm) ', 'Waist(inch)', 'Hip(inch)'], axis=1)

## XGBOOST


In [9]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
import xgboost as xgb

predicted = []
expected = []



model = xgb.XGBClassifier()
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42, test_size=0.2)

model.fit(x_train, y_train)

expected.extend(y_test)
predicted.extend(model.predict(x_test))
print(metrics.classification_report(expected, predicted))

              precision    recall  f1-score   support

         0.0       0.88      0.88      0.88        80
         1.0       0.64      0.64      0.64        28

    accuracy                           0.81       108
   macro avg       0.76      0.76      0.76       108
weighted avg       0.81      0.81      0.81       108



In [10]:
from sklearn.model_selection import cross_val_score
import xgboost as xgb
scores = cross_val_score(xgb.XGBClassifier(), 
                              x, y, 
                              cv=10, scoring='f1_weighted')
print(scores)
print('10 folds CV with weighted-F1: ', scores.mean())

[0.88433048 0.92288699 0.87133435 0.69417989 0.70100393 0.76081716
 0.81481481 0.90895578 0.5814055  0.75657511]
10 folds CV with weighted-F1:  0.7896304003771355


## 隨機森林

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics 

predicted = []
expected = []

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=5, test_size=0.2) 
model = RandomForestClassifier()

model.fit(x_train, y_train)

expected.extend(y_test)
predicted.extend(model.predict(x_test))
print(metrics.classification_report(expected, predicted))

              precision    recall  f1-score   support

         0.0       0.92      0.90      0.91        77
         1.0       0.76      0.81      0.78        31

    accuracy                           0.87       108
   macro avg       0.84      0.85      0.84       108
weighted avg       0.87      0.87      0.87       108



In [12]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
scores = cross_val_score(RandomForestClassifier(), 
                              x, y, 
                              cv=10, scoring='f1_weighted')
print(scores)
print('10 folds CV with weighted-F1: ', scores.mean())

[0.90235543 0.96229288 0.90809596 0.80190817 0.75313131 0.77777778
 0.82909091 0.92592593 0.5952381  0.74022089]
10 folds CV with weighted-F1:  0.8196037363861283


## Decision Tree

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

expected = y_test
predicted = model.predict(X_test)
print(metrics.classification_report(expected, predicted))

              precision    recall  f1-score   support

         0.0       0.77      0.81      0.79        74
         1.0       0.53      0.47      0.50        34

    accuracy                           0.70       108
   macro avg       0.65      0.64      0.64       108
weighted avg       0.69      0.70      0.70       108



In [14]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
scores = cross_val_score(DecisionTreeClassifier(), 
                              x, y, 
                              cv=10, scoring='f1_weighted')
print(scores)
print('10 folds CV with weighted-F1: ', scores.mean())

[0.74697856 0.72761494 0.78312448 0.60416667 0.63119676 0.62374139
 0.74074074 0.72686734 0.60821918 0.82106313]
10 folds CV with weighted-F1:  0.7013713170831242


## NB Classifier

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
model = MultinomialNB()
model.fit(X_train, y_train)

expected = y_test
predicted = model.predict(X_test)
print(metrics.classification_report(expected, predicted))

              precision    recall  f1-score   support

         0.0       0.94      0.82      0.88        77
         1.0       0.66      0.87      0.75        31

    accuracy                           0.83       108
   macro avg       0.80      0.84      0.81       108
weighted avg       0.86      0.83      0.84       108



In [18]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(MultinomialNB(), 
                              x, y, 
                              cv=10, scoring='f1_weighted')
print(scores)
print('10 folds CV with weighted-F1: ', scores.mean())

[0.88433048 0.98132435 0.90920498 0.85569986 0.89022556 0.69417989
 0.79970271 0.94537347 0.63678715 0.75657511]
10 folds CV with weighted-F1:  0.8353403571802565
