In [12]:


import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report



In [3]:
df = pd.read_csv("xray_input.csv")
df.head()

Unnamed: 0,Image Index,x0,x1,x2,x3,x4,x5,x6,x7,x8,...,Edema,Emphysema,Fibrosis,Effusion,Pneumonia,Pleural_Thickening,Cardiomegaly,Nodule,Mass,Hernia
0,00000001_000.png,-0.187979,0.043346,-0.005866,-0.180116,-0.216491,0.181554,0.053038,-0.242247,-0.196432,...,0,0,0,0,0,0,1,0,0,0
1,00000001_001.png,-0.173904,0.104313,-0.027356,0.061408,-0.213318,0.29748,0.328217,-0.221172,-0.119333,...,0,1,0,0,0,0,1,0,0,0
2,00000001_002.png,-0.156875,0.002193,-0.183765,0.010118,-0.242131,0.274381,-0.061397,0.085868,-0.178546,...,0,0,0,1,0,0,1,0,0,0
3,00000002_000.png,-0.197439,0.289834,-0.143378,0.2384,-0.159914,0.132197,-0.086083,-0.187805,-0.050839,...,0,0,0,0,0,0,0,0,0,0
4,00000003_000.png,-0.123684,-0.005224,-0.083841,0.031354,-0.202258,0.914344,0.074426,-0.22206,-0.199442,...,0,0,0,0,0,0,0,0,0,1


In [None]:


# Assuming df is your dataframe with features x0-x2048 and the 14 label columns
# Separate features and labels
X = df.filter(regex='^x\d+')
y = df[["Atelectasis", "Consolidation", "Infiltration", "Pneumothorax", "Edema",
       "Emphysema", "Fibrosis", "Effusion", "Pneumonia", "Pleural_Thickening",
       "Cardiomegaly", "Nodule", "Mass", "Hernia"]]

# Create a column indicating if all labels are 0 (no findings)
y['no_finding'] = (y.sum(axis=1) == 0).astype(int)

# Split the data first to avoid data leakage
X_train, X_test, y_train, y_test = train_test_split(
    X, y.drop('no_finding', axis=1), 
    test_size=0.2, 
    random_state=42, 
    stratify=y['no_finding']
)

# Scale the data before applying PCA
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply PCA - try reducing to ~100-200 components to start
# You can adjust n_components based on your RAM constraints and the variance explained
n_components = 100  # Starting point, adjust as needed
pca = PCA(n_components=n_components, random_state=42)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Check explained variance to ensure we've captured enough information
explained_variance = np.sum(pca.explained_variance_ratio_)
print(f"Total explained variance with {n_components} components: {explained_variance:.4f}")

# Now proceed with your balanced sampling as before
no_findings = y_train.sum(axis=1) == 0
X_train_no_findings = X_train_pca[no_findings]
y_train_no_findings = y_train[no_findings]
X_train_findings = X_train_pca[~no_findings]
y_train_findings = y_train[~no_findings]

# Sample only a portion of the no-findings records
sample_size = int(len(X_train_no_findings) * 0.05)  # Adjust ratio as needed
indices = np.random.choice(len(X_train_no_findings), sample_size, replace=False)
X_train_no_findings_sampled = X_train_no_findings[indices]
y_train_no_findings_sampled = y_train_no_findings.iloc[indices]

# Combine the sampled no-findings with all findings samples
X_train_balanced = np.vstack([X_train_no_findings_sampled, X_train_findings])
y_train_balanced = pd.concat([y_train_no_findings_sampled, y_train_findings])


  X = df.filter(regex='^x\d+')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['no_finding'] = (y.sum(axis=1) == 0).astype(int)


Total explained variance with 40 components: 0.6891


In [18]:
X_train_balanced

array([[  9.92501332,  -6.41496342,  -1.99501203, ...,   2.35942752,
          1.27022094,   0.55714801],
       [-12.77496535, -12.26348298,  -9.3016874 , ...,   1.4873168 ,
          0.84447726,  -0.57561964],
       [ 18.6370205 ,  22.6347324 ,   7.74558895, ...,   2.71863874,
          1.88977786,  -2.99513956],
       ...,
       [ -4.22392115,   8.18450975,  -0.90716789, ...,   6.52066556,
         -0.23765357,   5.04820798],
       [-11.7175822 ,  15.33558601, -11.49012356, ...,  -2.16372887,
          3.34638274,   2.12360637],
       [ 10.33271329,  -7.85361759,   9.00489929, ...,  -1.45122805,
         -2.46506326,   1.07029737]], shape=(53479, 40))

In [19]:
y_train_balanced

Unnamed: 0,Atelectasis,Consolidation,Infiltration,Pneumothorax,Edema,Emphysema,Fibrosis,Effusion,Pneumonia,Pleural_Thickening,Cardiomegaly,Nodule,Mass,Hernia
24497,0,0,0,0,0,0,0,0,0,0,0,0,0,0
42727,0,0,0,0,0,0,0,0,0,0,0,0,0,0
65580,0,0,0,0,0,0,0,0,0,0,0,0,0,0
74934,0,0,0,0,0,0,0,0,0,0,0,0,0,0
65545,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39768,1,1,0,0,0,0,0,0,0,0,0,0,0,0
79030,1,1,1,0,0,0,1,1,0,0,0,0,1,0
103080,0,0,0,0,0,1,0,0,0,1,0,1,0,0
96763,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [20]:

# Train multi-label SVC with the reduced dimensionality data
svc = SVC(kernel='rbf', C=0.1, probability=True)
multi_svc = MultiOutputClassifier(svc, n_jobs=-1)
multi_svc.fit(X_train_balanced, y_train_balanced)
print("Finished Fit")

# Predict and evaluate
y_pred = multi_svc.predict(X_test_pca)

# Evaluate per class
for i, label in enumerate(y_train.columns):
    print(f"Classification report for {label}:")
    print(classification_report(y_test.iloc[:, i], y_pred[:, i]))


python(74572) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(74573) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(74574) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(74575) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(74576) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(74577) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(74578) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(74579) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(74580) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(74581) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Finished Fit


python(79657) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(79658) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(79659) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(79660) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(79661) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(79662) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(79663) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Classification report for Atelectasis:
              precision    recall  f1-score   support

           0       0.90      1.00      0.95     20114
           1       0.00      0.00      0.00      2310

    accuracy                           0.90     22424
   macro avg       0.45      0.50      0.47     22424
weighted avg       0.80      0.90      0.85     22424

Classification report for Consolidation:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98     21518
           1       0.00      0.00      0.00       906

    accuracy                           0.96     22424
   macro avg       0.48      0.50      0.49     22424
weighted avg       0.92      0.96      0.94     22424

Classification report for Infiltration:
              precision    recall  f1-score   support

           0       0.82      1.00      0.90     18442
           1       0.00      0.00      0.00      3982

    accuracy                           0.82     22424
   macro

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize