In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from string import punctuation
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import plot_tree
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import RocCurveDisplay
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

SEED = 456
#TEST = 0.2

In [None]:
# Read the four pre-split CSV files; the first column of each file is used as the row index.
split_files = (
    "X_train_restaurants.csv",
    "X_test_restaurants.csv",
    "y_train_restaurants.csv",
    "y_test_restaurants.csv",
)
X_train, X_test, y_train, y_test = (
    pd.read_csv(path, index_col = [0]) for path in split_files
)

In [None]:
# Preview the training feature table that was just loaded.
X_train

In [None]:
# Class distribution of the training labels.
y_train["GRADE"].value_counts()

# B and C are unbalanced. Options:
#   - generate synthetic samples with SMOTE
#     (risky because it would need to generate more instances than currently exist)
#   - OR undersample A (down to 4000 - 4500)


# Modelling plan:
#   decision trees (random forest - normal data)
#   penalized SVM (normal data)
# THEN:
#   SVM (over/undersampled data)
#   naive bayes (oversampled data)


# Evaluation metrics:
#   ROC and AUC
#   F1

In [None]:
# Tabular feature set: everything except the free-text violation description.
X_train_sub = X_train.drop("VIOLATION DESCRIPTION", axis = 1)
X_test_sub = X_test.drop("VIOLATION DESCRIPTION", axis = 1)

# One-hot encode the categorical columns.
cols = ['BORO', 'CUISINE DESCRIPTION','INSPECTION TYPE', 'INSPECTION PURPOSE']
X_train_sub = pd.get_dummies(X_train_sub, columns = cols)
X_test_sub = pd.get_dummies(X_test_sub, columns = cols)

# get_dummies only creates columns for the categories present in the frame it is given,
# so train and test can end up with different columns. Align the test frame to the
# training columns: categories unseen in test become all-zero columns, categories that
# appear only in test are dropped, and the column order matches what the models were fit on.
X_test_sub = X_test_sub.reindex(columns = X_train_sub.columns, fill_value = 0)

X_train_sub

## Models

In [None]:
# Absolute grade counts, followed by each grade's share of the training rows in percent.
grade_counts = y_train["GRADE"].value_counts()
display(grade_counts)

grade_counts.values / len(X_train_sub) * 100

## Random Forest

Key Assumption - ...

- Decision trees tend to handle unbalanced data well

In [None]:
from sklearn.model_selection import GridSearchCV

# Shuffle before splitting so the folds do not depend on the row order of the CSV
# (seeded for reproducibility).
kfold = KFold(n_splits = 5, shuffle = True, random_state = SEED)

# Candidate class weights and leaf sizes for the forest.
parameters = {'class_weight':[None, "balanced", {"A":1, "B":3, "C":3}], 'min_samples_leaf':[100, 200, 300]}
rf = RandomForestClassifier(max_depth = 5, random_state = SEED)

# Select on weighted F1, the metric this notebook reports on the test set. The default
# scorer (accuracy) favours always predicting the majority grade, which makes tuning
# class_weight largely pointless on unbalanced data.
best_rf = GridSearchCV(rf, parameters, cv = kfold, scoring = "f1_weighted")
best_rf.fit(X_train_sub, y_train["GRADE"])
best_rf.best_params_

In [None]:
# Hard grade predictions from the tuned forest, scored with weighted F1 on the test set.
rf_pred = best_rf.predict(X_test_sub)
f1_score(
    y_true = y_test["GRADE"],
    y_pred = rf_pred,
    average = "weighted",
)

In [None]:
#X_train_sub.info()#.select_dtypes(include = ["int64"])
#type(best_rf_pred_num)
#type(grades)
#best_rf_pred_num.shape

X_train_sub["YEAR"].value_counts()  # author's note: years run 2015 (only 2 rows) to 2023

In [None]:
#predicted = pd.DataFrame(best_rf_pred, columns = best_rf.classes_)
#display(predicted)
#col_max = predicted.max(axis = 1)
#for col in predicted:
    #predicted.loc[predicted[col] == col_max, col] = col

#predicted

In [None]:
#for i in predicted.index:
    #print(col)
    
#predicted["A"].astype("str") + predicted["B"].astype("str") + predicted["C"].astype("str")

In [None]:
# Ground-truth test grades, kept as string labels: roc_auc_score accepts them directly,
# so no numeric re-encoding of the grades is needed.
grades = y_test["GRADE"].copy()

# Per-class probabilities from the tuned forest.
best_rf_pred = best_rf.predict_proba(X_test_sub)

# One-vs-rest ROC AUC, averaged across grades with weights proportional to class support.
rf_auc = roc_auc_score(
    y_true = grades,
    y_score = best_rf_pred,
    multi_class = "ovr",
    average = "weighted",
)

In [None]:
# Weighted one-vs-rest ROC AUC of the random forest on the test set.
rf_auc

In [None]:
# One-hot encode the test grades for one-vs-rest ROC curves. Fit and transform on the
# GRADE column (a 1-D label vector) rather than the whole label DataFrame, matching
# how the labels are used everywhere else in this notebook.
label_binarizer = LabelBinarizer().fit(y_train["GRADE"])
y_onehot_test = label_binarizer.transform(y_test["GRADE"])

def plot_roc_curve(class_of_interest, prediction):
    """Plot the one-vs-rest ROC curve of a single grade.

    Parameters
    ----------
    class_of_interest : str
        Grade label to treat as the positive class (must be one of
        ``label_binarizer.classes_``).
    prediction : array of shape (n_test_samples, n_classes)
        Predicted class probabilities; columns are indexed in the same order
        as ``label_binarizer.classes_``.

    Reads the module-level ``label_binarizer`` and ``y_onehot_test``.
    """
    # Column position of the requested grade in the one-hot / probability matrices.
    class_id = np.flatnonzero(label_binarizer.classes_ == class_of_interest)[0]

    RocCurveDisplay.from_predictions(
        y_onehot_test[:, class_id],
        prediction[:, class_id],
        name = f"{class_of_interest} vs Other Classes",
        color = "dodgerblue"
    )
    plt.axis("square")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("One-vs-Rest ROC curves:\n{} vs Other Classes".format(class_of_interest))
    # Diagonal reference line (a classifier with no skill).
    plt.plot([0, 1], [0, 1], linestyle = '--', lw = 2, color = 'black', label = 'Guideline')
    plt.legend()
    plt.show()

plot_roc_curve("A", best_rf_pred)
plot_roc_curve("B", best_rf_pred)
plot_roc_curve("C", best_rf_pred)

In [None]:
#best_rf.estimators_[0]
## not possible with grid search cv
#y_train["GRADE"]

In [None]:
#from sklearn import tree
#i = 99

#fn = X_train_sub.columns
#cn = y_train.GRADE.unique()
#fig, axes = plt.subplots(nrows = 1, ncols = 1, figsize = (3, 3), dpi = 1000)
#tree.plot_tree(best_rf.estimators_[i],
               #feature_names = fn, 
               #class_names = cn,
               #filled = True);
#fig.savefig('RF_Tree100.png')

In [None]:
# best model f1 score
#f1_score(y_test["GRADE"], best_rf.predict(X_test_sub), average = "weighted")

# add precision and recall ()
# compare models at end

In [None]:
#knjwer

## Naive Bayes

Key Assumption - the features (x's) are conditionally independent of one another given the class.
"Thus, if we know that warm weather improves our sales process, then this weather effect is conditionally independent from the age of our customers if this weather effect does not increase or decrease if the customer is younger or older."
- i.e., the effect of x1 on y stays constant as x2 changes

In [None]:
# Re-inspect the training features, including the VIOLATION DESCRIPTION text used in this section.
X_train

### Clean Data

In [None]:
# Work on explicit copies: new columns are assigned to these frames in the cleaning
# cells, and assigning into a column subset of X_train / X_test would otherwise
# trigger pandas' SettingWithCopyWarning.
violations_train = X_train[["BORO", "CUISINE DESCRIPTION", "VIOLATION DESCRIPTION"]].copy()
violations_test = X_test[["BORO", "CUISINE DESCRIPTION", "VIOLATION DESCRIPTION"]].copy()

violations_train

In [None]:
def make_lower(text):
    """Return ``text`` converted to lowercase."""
    lowered = text.lower()
    return lowered

def replace_degree(text):
    """Return ``text`` with every degree sign removed."""
    without_degree = text.replace("°", "")
    return without_degree

def split_text(text):
    """Split ``text`` on single spaces; consecutive spaces yield empty-string tokens."""
    tokens = text.split(" ")
    return tokens

# Characters to strip: all ASCII punctuation except the hyphen (so hyphenated
# terms survive), plus the degree sign.
p = [mark for mark in punctuation if mark != "-"]
p.append('°')

def strip_punctuation(text, p = p):
    """Return ``text`` with every string listed in ``p`` removed."""
    cleaned = text
    for mark in p:
        cleaned = cleaned.replace(mark, "")
    return cleaned

# English stop words from NLTK. On a fresh environment the corpus is not installed and
# nltk raises LookupError, so fetch it once on demand and retry.
try:
    STOP_WORDS = set(nltk.corpus.stopwords.words("english"))
except LookupError:
    nltk.download("stopwords")
    STOP_WORDS = set(nltk.corpus.stopwords.words("english"))

def remove_stop_words(text_list):
    """Return the tokens of ``text_list`` that are not in ``STOP_WORDS``, in order."""
    return [word for word in text_list if word not in STOP_WORDS]

def combine_tokens(text_list):
    """Join the tokens of ``text_list`` into one space-separated string."""
    return " ".join(text_list)


# Tokenise the raw descriptions: strip degree signs, lowercase, remove punctuation, split.
# replace_degree is chained into the pipeline instead of being written to a separate,
# misspelled "VIOLATIONS DESCRIPTION" column that nothing read afterwards.
violations_train["VIOLATIONS"] = (
    violations_train["VIOLATION DESCRIPTION"]
    .apply(replace_degree)
    .apply(make_lower)
    .apply(strip_punctuation)
    .apply(split_text)
)
# Token lists without stop words, then re-joined into one string per row for vectorising.
violations_train["CLEAN VIOLATIONS"] = violations_train["VIOLATIONS"].apply(remove_stop_words)
violations_train["PROCESSED VIOLATIONS"] = violations_train["CLEAN VIOLATIONS"].apply(combine_tokens)
violations_train

In [None]:
# Number of training rows per borough.
violations_train["BORO"].value_counts()
# TODO: split and analyze by borough/boro

In [None]:
# Bag-of-words counts for the processed training descriptions (one column per term).
vec = CountVectorizer()
X = vec.fit_transform(violations_train["PROCESSED VIOLATIONS"])
matrix = pd.DataFrame(
    X.toarray(),
    columns = vec.get_feature_names_out(),
    index = violations_train.index,
)

# Keep only the terms that occur at least 100 times in the training set.
frequent_terms = matrix.sum() >= 100
matrix = matrix.loc[:, frequent_terms]
matrix

In [None]:
# Thirty most frequent terms in the training count matrix.
matrix.sum().sort_values(ascending = False).head(30)

# Candidate words to drop: food, non, andor, may, properly, improperly, unacceptable, ...

In [None]:
# Remove the terms flagged for dropping, then re-check the most frequent ones.
noise_terms = [
    "food", "non", "andor", "may",
    "properly", "improperly", "unacceptable", "used",
]
matrix = matrix.drop(columns = noise_terms)
matrix.sum().sort_values(ascending = False).head(30)

In [None]:
# Repeat the text cleaning on the test data.

violations_test["VIOLATIONS"] = violations_test["VIOLATION DESCRIPTION"].apply(make_lower)
violations_test["VIOLATIONS"] = violations_test["VIOLATIONS"].apply(strip_punctuation)
violations_test["VIOLATIONS"] = violations_test["VIOLATIONS"].apply(split_text)
violations_test["CLEAN VIOLATIONS"] = violations_test["VIOLATIONS"].apply(remove_stop_words)
violations_test["PROCESSED VIOLATIONS"] = violations_test["CLEAN VIOLATIONS"].apply(combine_tokens)


# Encode the test text with the vocabulary learned on the training data: transform(),
# not fit_transform(). Re-fitting on the test set would build a different vocabulary,
# so the columns would not line up with what the classifier is trained on.
X2 = vec.transform(violations_test["PROCESSED VIOLATIONS"])
matrix2 = pd.DataFrame(X2.toarray(), columns = vec.get_feature_names_out(),
                       index = violations_test.index)

# Keep exactly the training feature columns, in the same order. Feature selection is
# decided on the training matrix only; filtering by test-set frequency would leak
# test information and leave mismatched columns.
matrix2 = matrix2.reindex(columns = matrix.columns, fill_value = 0)

matrix2.sum().sort_values(ascending = False).head(30)

In [None]:
# Terms present in the test matrix but not in the training matrix.
missing = [x for x in matrix2 if x not in matrix]
display(missing)

# Terms present in the training matrix but not in the test matrix.
display([x for x in matrix if x not in matrix2])

# Give the test matrix exactly the training columns, in training order: test-only terms
# are dropped and terms absent from the test matrix are added as all-zero counts.
# Dropping the test-only terms alone would leave the two matrices with different
# feature sets, which the fitted classifier cannot predict on.
matrix2 = matrix2.reindex(columns = matrix.columns, fill_value = 0)

In [None]:
alphas = np.linspace(0.001, 10, 10)

# Choose the smoothing strength by 5-fold cross-validation on the training data only.
# Scoring each alpha on the test set would tune the model to the test data and make the
# reported test metrics optimistic. GridSearchCV is imported in the random forest section.
nb_search = GridSearchCV(MultinomialNB(fit_prior = True), {"alpha": alphas},
                         scoring = "f1_weighted", cv = 5)
nb_search.fit(matrix, y_train["GRADE"])

# Mean cross-validated weighted F1 for each candidate alpha.
for alpha, score in zip(nb_search.cv_results_["param_alpha"],
                        nb_search.cv_results_["mean_test_score"]):
    print("Alpha = {}, \n{}".format(alpha, score))

# Take the winner from the search instead of a hand-typed constant.
best_alpha = nb_search.best_params_["alpha"]

In [None]:
# Refit naive Bayes with the selected smoothing strength and get test-set class probabilities.
best_nb_text = MultinomialNB(alpha = best_alpha)
best_nb_text.fit(matrix, y_train["GRADE"])
best_nb_pred = best_nb_text.predict_proba(matrix2)

# Use the same averaging as the random forest AUC (weighted one-vs-rest) so the two
# scores are directly comparable; the default is the unweighted macro average.
nb_roc = roc_auc_score(grades, best_nb_pred, multi_class = "ovr", average = "weighted")
nb_roc

In [None]:
# One-vs-rest ROC curve of the naive Bayes model for each grade.
for grade in ("A", "B", "C"):
    plot_roc_curve(grade, best_nb_pred)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
#vectorizer = TfidfVectorizer(stop_words = 'english')

def show_top10(classifier, vectorizer, categories, n = 15):
    """Print the highest-weighted features of each class, one line per class.

    Parameters
    ----------
    classifier : fitted estimator
        Must expose per-class feature weights as ``feature_log_prob_`` (naive Bayes)
        or, failing that, ``coef_``; one row per class.
    vectorizer : object with a ``columns`` attribute
        Supplies the feature names, in the column order the classifier was fit on
        (a DataFrame of features in this notebook).
    categories : sequence
        Class labels in the same order as the rows of the weight matrix,
        i.e. ``classifier.classes_``.
    n : int, default 15
        Number of features printed per class (positive). The default keeps the
        previous hard-coded slice size.

    Features are printed in ascending weight order, strongest last.
    """
    feature_names = np.asarray(vectorizer.columns)
    # MultinomialNB no longer provides coef_ in current scikit-learn; the per-class
    # log-probabilities carry the same information. Fall back to coef_ for other models.
    weights = getattr(classifier, "feature_log_prob_", None)
    if weights is None:
        weights = classifier.coef_
    for i, category in enumerate(categories):
        top = np.argsort(weights[i])[-n:]
        print("{}: {}".format(category, " ".join(feature_names[top])))

# Label the rows with classifier.classes_: the weight rows follow that order, whereas
# y_train.GRADE.unique() returns grades in order of first appearance and can mislabel them.
show_top10(best_nb_text, matrix, best_nb_text.classes_)

In [None]:
# NOTE(review): `jhbkl` is not defined anywhere in the visible notebook, so this cell raises
# NameError. Presumably a deliberate stop for "Run All"; confirm, and remove if not needed.
jhbkl

In [None]:
# LDA of most common words in classes (can get from nb??)
#from gensim.corpora import Dictionary

#gensim_dictionary = Dictionary(violations_train["CLEAN VIOLATIONS"])
#gensim_dictionary.token2id

In [None]:
#gensim_dictionary.filter_extremes(no_below=20, no_above=0.5)
# Empty placeholder: the loop that would fill it with gensim bag-of-words rows is commented out.
corpus = []
#for doc in violations_train["CLEAN VIOLATIONS"]:
#    corpus.append(gensim_dictionary.doc2bow(doc))

#corpus

In [None]:
#from gensim.models import LdaModel

#NUM_TOPICS = 3
#CHUNKSIZE = 2000 
#PASSES = 20 # how many times to rerun algorithm to improve
#ITERATIONS = 800 # how many times to rerun algorithm to improve

# Make an index to word dictionary.
#temp = gensim_dictionary[0]  
#word_id = gensim_dictionary.id2token

#model = LdaModel(
    #corpus=corpus,
    #id2word=word_id,
    #chunksize=CHUNKSIZE,
    #alpha='auto',
    #eta='auto',
    #iterations=ITERATIONS,
    #num_topics=NUM_TOPICS,
    #passes=PASSES,
#)

#list_of_topic_tables = []
#for topic in model.show_topics(
    #num_topics=-1, num_words=10, formatted=False
#):
    #list_of_topic_tables.append(
        #pd.DataFrame(
            #topic[1],
            #columns=["Word" + "_" + str(topic[0]), "Prob" + "_" + str(topic[0])],
        #)
    #)
#list_of_topic_tables

In [None]:
# NOTE(review): `slnds` is not defined anywhere in the visible notebook, so this cell raises
# NameError. Presumably a deliberate stop for "Run All"; confirm, and remove if not needed.
slnds

In [None]:
# Drop the free-text column from the modelling frames. errors = "ignore" keeps the cell
# re-runnable: without it a second execution raises KeyError because the column is
# already gone.
X_train = X_train.drop(columns = ["VIOLATION DESCRIPTION"], errors = "ignore")
X_test = X_test.drop(columns = ["VIOLATION DESCRIPTION"], errors = "ignore")
X_train
# TODO: go back and drop Record & Grade Date

In [None]:
# One-hot encode the categorical columns.
cols = ['BORO', 'CUISINE DESCRIPTION','INSPECTION TYPE', 'INSPECTION PURPOSE']
X_train = pd.get_dummies(X_train, columns = cols)
# Encoding train and test separately can produce different dummy columns when a category
# appears in only one split; align the test frame to the training columns
# (missing ones are filled with 0, test-only ones are dropped).
X_test = pd.get_dummies(X_test, columns = cols).reindex(columns = X_train.columns, fill_value = 0)

In [None]:
# Preview X_train after one-hot encoding.
X_train

In [None]:
# NOTE(review): `dns` is not defined anywhere in the visible notebook, so this cell raises
# NameError. Presumably a deliberate stop for "Run All"; confirm, and remove if not needed.
dns

## SVM

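Key Assumption - the classes are (approximately) linearly separable, either in the original feature space (linear kernel) or in the space induced by the kernel (rbf, poly).
This means that there is a clear boundary between the different classes in the data. 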

Another assumption - the features are standardized.
This means that each input feature is rescaled to have a mean of zero and a standard deviation of one.

In [None]:
# Scaling vs normalizing: look at the distribution of each continuous feature first.

cols = [
    "Latitude",
    "Longitude",
    "CRITICAL FLAGS PER INSPECTION",
    "MONTHS OPERATING",
    "TOTAL INSPECTIONS",
    "VIOLATIONS PER INSPECTION",
]

# One histogram per feature, titled with the column name.
for col in cols:
    ax = sns.histplot(X_train[col])
    ax.set_title(col)
    plt.show()

In [None]:
# Numeric (float64 / int64) columns to standardize; POST COVID and RISKY BUSINESS are
# left unscaled (presumably 0/1 indicator columns - TODO confirm).
unscaled_cols = ["POST COVID", "RISKY BUSINESS"]
num_cols = (
    X_train
    .select_dtypes(include = ["float64", "int64"])
    .drop(columns = unscaled_cols)
)

# Standardize each column, then restore the original row index and column names.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(num_cols)
scaled_violations = pd.DataFrame(
    scaled_features,
    index = num_cols.index,
    columns = num_cols.columns,
)
scaled_violations

In [None]:
# Everything that was not standardized: the remaining non-numeric and excluded columns.
# Drop by column name; passing the num_cols DataFrame itself only worked by accident.
other_cols = X_train.drop(columns = num_cols.columns)
# The text column has normally been removed from X_train by this point; errors = "ignore"
# avoids the KeyError that an unconditional drop raises when it is already gone.
other_cols = other_cols.drop(columns = "VIOLATION DESCRIPTION", errors = "ignore")
other_cols = pd.get_dummies(other_cols)
# Recombine with the standardized numeric features, matching rows on the index.
clean_violations = other_cols.merge(scaled_violations, right_index = True, left_index = True)


#from sklearn.pipeline import make_pipeline
#scaled_features_test = X_test.select_dtypes(include = ["float64", "int64"])\
#.drop(["POST COVID", "RISKY BUSINESS"], axis = 1)

#clean_violations_test = make_pipeline(scaled_features_test)
#clean_violations_test

In [None]:
# NOTE(review): this linspace result is discarded, since only a cell's last expression is displayed.
np.linspace(0.1, 10, 15)
# Preview the assembled SVM feature matrix.
clean_violations

In [None]:
# Search grid for the SVM: regularisation strength, kernel, kernel coefficient and class weights.
parameters = {'C':np.linspace(0.1, 10, 5), 'gamma':['scale', 'auto'],
              'class_weight':[None, 'balanced', {'A':1, 'B':3, 'C':3}],
              'kernel':['linear', 'rbf', 'poly']}
svc = SVC(probability = True, random_state = SEED)

# Select on weighted F1, the metric this notebook reports; accuracy favours the majority
# grade and makes the class_weight options hard to compare.
best_svc = GridSearchCV(svc, parameters, cv = kfold, scoring = "f1_weighted")

# Fit on the standardized feature matrix built for the SVM (clean_violations), not the
# unscaled X_train_sub: SVMs are sensitive to feature scale.
best_svc.fit(clean_violations, y_train["GRADE"])
best_svc.best_params_

In [None]:
# NOTE(review): `knkn` is not defined anywhere in the visible notebook, so this cell raises
# NameError. Presumably a deliberate stop for "Run All"; confirm, and remove if not needed.
knkn

In [None]:
#from sklearn.svm import SVC

#reg_params = np.linspace(0.1, 10, 3)
#gammas = ["scale", "auto"]
#class_weights = [None, "balanced"]

#for param in reg_params:
    #for gamma in gammas:
        #for weight in class_weights:
            #svc = SVC(C = param, gamma = gamma, class_weight = weight, probability = True, 
                      #random_state = SEED)
            #svc.fit(clean_violations, y_train["GRADE"])
            #svc_pred = svc.predict(X_test)
            #svc_pred = svc.predict_proba(X_test)
            #print("Weight = {}, Gamma = {}, C = {}, \n{}".format(weight, gamma, param,
                                                                 #classification_report(grades, 
                                                                                       #svc_pred, 
                                                                                       #zero_division = 0.)))
            
# zero_division
#svc_auc = roc_auc_score(grades, best_svc_pred, multi_class = "ovr")

In [None]:
# penalized SVM uses C != 1.0

