## Train the Models

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from visualize import SYMPTOMS, CAUSES

# convert gender to number
def gender_mapper(gender):
    """Map a gender label to a number: 1 for male ("M"), 0 for anything else.

    Surrounding whitespace and letter case are ignored so that hand-typed
    answers such as "m" or " M " are not silently treated as female.
    Non-string values (e.g. a missing value) map to 0, exactly as they did
    with the plain equality check.
    """
    is_male = isinstance(gender, str) and gender.strip().upper() == "M"
    return 1 if is_male else 0

# demographics which may be helpful in prediction
DEMOGRAPHICS = ["GENDER", "AGE"]

# model types to test
MODELS = ["random_forest", "naive_bayes", "svc"]

# seed for the train/test split; without it every run draws a different split
# and the printed accuracies cannot be reproduced with Restart & Run All
SPLIT_RANDOM_STATE = 101

# reads the csv file once; every frame below is derived from this single read
raw_df = pd.read_csv("data/lung-cancer.csv")

# changes 2's to 1's and 1's to 0's - convert to binary for yes and no
df = raw_df[SYMPTOMS + CAUSES] - 1

# get demographic data
demo_df = raw_df[DEMOGRAPHICS]
for stat in DEMOGRAPHICS:
    if stat == "GENDER":    # set males to 1, females to 0
        df[stat] = demo_df[stat].map(gender_mapper)
    else:
        df[stat] = demo_df[stat]

# puts the feature columns in a fixed order: symptoms, causes, demographics
# (.copy() so final_df is an independent frame, not a view of df)
final_df = df[SYMPTOMS + CAUSES + DEMOGRAPHICS].copy()

# gets if they have cancer or not
results = raw_df["LUNG_CANCER"]

# split into train and test (seeded so the split is the same on every run)
train_x, test_x, train_y, test_y = train_test_split(
    final_df, results, random_state=SPLIT_RANDOM_STATE
)

# create models
random_forest = RandomForestClassifier(random_state=101, n_estimators=100)
naive_bayes = MultinomialNB()
svc = SVC()

# explicit name -> estimator lookup; this replaces eval(name), which would
# execute whatever string happens to be in MODELS instead of just looking
# up a variable. Keep the keys in sync with MODELS.
model_by_name = {
    "random_forest": random_forest,
    "naive_bayes": naive_bayes,
    "svc": svc,
}

# train and test models
for name in MODELS:

    # gets model based on what the string is
    model = model_by_name[name]
    model.fit(train_x, train_y)

    # test model accuracy on the held-out split
    predicted_y = model.predict(test_x)
    accuracy = accuracy_score(test_y, predicted_y)

    print(f"The accuracy of the {name} model is {round(100 * accuracy, 2)}%.")

The accuracy of the random_forest model is 94.87%.
The accuracy of the naive_bayes model is 89.74%.
The accuracy of the svc model is 92.31%.


## Use the Models

In [13]:
# gets the input
print("For the following questions, enter \"1\" if the symptom/cause applies to you, and \"0\" if not. For \"GENDER\", answer 'M' or 'F', and for \"AGE\", enter your age as an integer.")

# blank results list, kept as in the original cell
# NOTE(review): this rebinds `results` (the training labels) and, below, `df`
# (the training frame) — re-run the training cell before reusing either name.
results = []

# collects one answer per feature, in the same column order used for training
answers = {}
for thing in SYMPTOMS+CAUSES+DEMOGRAPHICS:
    raw_answer = input(thing)
    if thing == "GENDER":
        # normalise before mapping so "m" / " M " are not read as female
        answers[thing] = [gender_mapper(raw_answer.strip().upper())]
    else:
        # input() returns text; the models were trained on numbers, so convert
        # explicitly instead of handing string-typed columns to predict()
        try:
            answers[thing] = [int(raw_answer)]
        except ValueError:
            raise ValueError(
                f"{thing} must be an integer, got {raw_answer!r}"
            ) from None

# creates the single-row dataframe from the collected answers
df = pd.DataFrame(answers)

# explicit name -> estimator lookup instead of eval(name); the estimators are
# the ones fitted in the training cell. Keep the keys in sync with MODELS.
fitted_models = {
    "random_forest": random_forest,
    "naive_bayes": naive_bayes,
    "svc": svc,
}

# evaluates each model
for name in MODELS:
    model = fitted_models[name]
    negation = "do not " if model.predict(df)[0] == "NO" else ""
    print(f"The {name.upper()} model predicts that you {negation}have lung cancer")


For the following questions, enter "1" if the symptom/cause applies to you, and "0" if not. For "GENDER", answer 'M' or 'F', and for "AGE", enter your age as an integer.
The random_forest model predicts that you  have lung cancer
The naive_bayes model predicts that you  have lung cancer
The svc model predicts that you  have lung cancer
