In [None]:
#importing necessary libraries
import pandas as pd
import re
import numpy as np
from sklearn.tree import DecisionTreeClassifier

In [None]:
#defining a function to list all the patients with their features
def separate_info(filename):
  """Parse "The <name> value is <number>" statements into a patient table.

  Every "PatientID" statement opens a new patient record; each statement
  that follows is stored as a feature of that patient until the next
  "PatientID" statement is reached.

  Parameters
  ----------
  filename : str
      Path of the text file to read.

  Returns
  -------
  pandas.DataFrame
      One row per patient and one column per feature name found in the
      file (including "PatientID"). Values are floats whenever the matched
      text can be parsed as a number; otherwise the matched text is kept
      unchanged.
  """
  with open(filename, "r") as f:
    content = f.read()

  pattern = r"The (\w+) value is ([\d\.\-]+)"   #feature name, then its numeric text
  features = re.findall(pattern, content)       #list of (name, value) string pairs

  patients = []
  current_patient = {}

  for key, value_str in features:
    #the value class also swallows a sentence-ending "." placed right after
    #the number ("3.5." -> float() fails), so strip trailing periods first
    try:
      value = float(value_str.rstrip("."))
    except ValueError:
      value = value_str   #not a number: keep the matched text as it is

    if key == "PatientID":
      #a new ID closes the record that was being built
      if current_patient:
        patients.append(current_patient)
      current_patient = {"PatientID": value}
    else:
      current_patient[key] = value

  #the last record is never followed by another ID, so flush it here
  if current_patient:
    patients.append(current_patient)

  return pd.DataFrame(patients)


In [None]:
#making the diagnosis table
def separate_targets(filename):
  """Parse the PatientID / Diagnosis statement pairs of a text file.

  Parameters
  ----------
  filename : str
      Path of the text file to read.

  Returns
  -------
  pandas.DataFrame
      Columns "PatientID" (int) and "Diagnosis" (the matched text: a run of
      digits or the literal "MASKED"), one row per matched pair. When
      nothing matches, the frame is empty but still has both columns.
  """
  with open(filename, "r") as f:
    content = f.read()

  pattern = r"The PatientID value is ([\d]+) \. The Diagnosis value is ([\d]+|MASKED)"
  targets = re.findall(pattern, content)

  target_list = [
    {'PatientID': int(patientID_str), 'Diagnosis': diagnosis}
    for patientID_str, diagnosis in targets
  ]

  #naming the columns keeps them present even when no pair was matched,
  #so callers can still reference 'PatientID' and 'Diagnosis'
  return pd.DataFrame(target_list, columns=['PatientID', 'Diagnosis'])


In [None]:
#parse both input files into one table each
df_info = separate_info("all_info.txt")           #features per patient
df_target = separate_targets("all_target.txt")    #diagnosis per patient

#keep only the patients that appear in both tables
df_merged = df_info.merge(df_target, on='PatientID', how='inner')

#rows with a known diagnosis, with the label text cast to integers
df_train = df_merged[df_merged['Diagnosis'].ne('MASKED')].astype({'Diagnosis': int})

#rows whose diagnosis is hidden: these are the ones to predict
df_predict = df_merged[df_merged['Diagnosis'].eq('MASKED')].copy()

#every column except the ID and the label is used as a model input
FEATURES = df_train.drop(columns=['PatientID', 'Diagnosis']).columns.tolist()

X_train, y_train = df_train[FEATURES], df_train["Diagnosis"]
X_predict = df_predict[FEATURES]

#build the decision tree with a fixed seed and fit it on the labelled rows
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

#predicted labels for the masked rows, returned as a NumPy array
dt_predictions = dt_model.predict(X_predict)

print(dt_predictions)

[1 1 1 0 0 0 1 0 1 1 0 0 0 0 0 1 0 1 1 0 0 1 0 0 0 1 0 0 0 1 1 1 0 1 1 0 0
 0 1 1 1 1 0 1 1 0 0 1 0 0 0 0 0 1 0 0 0 0 1 1 0 0 1 0 0 0 0 0 1 1 0 0 0 0
 0 0 0 0 1 0 0 1 1 0 1 1 1 0 0 0 0 0 0 0 0 1 0 0 1 1 1 1 0 0 1 1 0 1 0 0 0
 1 0 1 0 0 1 0 1 1 0 0 1 1 0 1 0 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 1 0 1 1 0 0 1 0 1 0 0 1 1 1 1 1 1 1 0 1 0 1 0 0 1 0 0 0 0 0 0 1 1 0 0 0
 1 0 1 1 0 0 1 0 0 0 0 1 0 0 1 1 1 0 1 1 0 1 1 0 0 1 0 1 1 1 1 0 0 1 0 0 0
 0 1 1 0 0 0 0 1 0 1 0 0 0 0 1 0 0 1 1 0 0 0 1 0 0 1 1 0 0 0 0 0 0 1 1 0 0
 0 0 0 0 1 0 0 1 1 0 1 1 1 0 0 0 1 1 0 1 0 0 1 0 0 0 1 1 0 0 0 0 1 0 0 1 0
 0 0 0 0 1 0 1 1 1 0 0 0 1 0 0 0 1 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0 1 0
 0 0 1 0 0 0 1 0 0 0 0 1 1 0 0 1 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0
 1 1 0 0 0 1 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 1 0 1 1
 0 1 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 1 0 0 0 0 1 0 1 0 1 0 0
 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 1 1 1 0 1 0 1 0 1 0 1 0 0 0 1 0 0
 1 1 0 1 0 0 0 0 0 1 0 0 

In [None]:
#write the dt_predictions array to "predictions.npy" (path is relative to the working directory)
np.save("predictions.npy", dt_predictions)