In [86]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score

import pickle

<h2> Full Functions <h2>

In [87]:
class Model:
    def __init__ (self, inputs, outputs):
        self.inputs = inputs
        self.outputs = outputs
        self.types_orders = ["Fried Catfish", "Sugar Cream Pie", "Indiana Pork Chili", 
                "Indiana Corn Cob", "Buffalo Tacos", "Sweet Potato Fries",
                "Grilled Cheese", "Pork Tenderloin Sandwich", "Cornbread Hushpuppies",
                "BBQ Pulled Pork Sandwich"]

    def load_dataset(self, file):
        df = pd.read_csv(file)
        #print(df)
        print(df.shape)

        #Changes strings into integers that can be used for multiclass categorizing
        for input in self.inputs:
            df[input]=df[input].astype('category').cat.codes
        df[self.outputs]=df[self.outputs].astype('category').cat.codes
        print(df)

        return df
    
    def train(self, df):
        orders_X, orders_Y = df[self.inputs].values, df[self.outputs].values
        # Split data 70% into train and 30% into test
        x_orders_train, x_orders_test, y_orders_train, y_orders_test = train_test_split(orders_X, 
                                                                                        orders_Y, 
                                                                                        test_size = 0.30,
                                                                                        random_state = 0,
                                                                                        stratify=orders_Y)

        columns = [0,1,2,3]
        transformer = Pipeline(steps=[
                        ('scaler', StandardScaler())
                        ])

        preprocessor = ColumnTransformer(
                        transformers=[('preprocess', transformer, columns)])

        pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', SVC(probability=True))])
        model = pipeline.fit(x_orders_train, y_orders_train)
        predictions = model.predict(x_orders_test)

        print (model)   
        print('Predicted labels: ', predictions[:30])
        print('Actual labels   : ' ,y_orders_test[:30])
        
        print("Overall Accuracy:",accuracy_score(y_orders_test, predictions))
        print("Overall Precision:",precision_score(y_orders_test, predictions, average='weighted'))
        return pickle.dumps(model)

    def predict(self, model, inputs):
        inputs = np.array(inputs)
        
        pred = model.predict(inputs)[0]
        print("Prediction: ", self.types_orders[pred])

<h2>Breakdown<h5>

<h4> Process data function </h4>

    def load_dataset(self, file):
        df = pd.read_csv(file)
        print(df)
        print(df.shape)

        for input in self.inputs:
            df[input]=df[input].astype('category').cat.codes
        df[self.outputs]=df[self.outputs].astype('category').cat.codes

        return df

Takes in the file name and puts it into a dataframe. Since 4/5 of the variables are strings, I took each one of the columns and labeled each one of the inputs into a number with cat.codes. This way, the model could take inputs in as numbers and not strings.

<h4> Training <h4>



When choosing what type of model I wanted, it had to be something that was able to take in multiple inputs and classify into a single output. At Sklearn's website, I found Multiclass Classification as my main model. All I had to do now was to decide what type of methods I would use to regularize the parameters. The one on the website had Logistic Regression, while another one I found had Singular Value Decomposition. So I made two models and chose the one to choose based on the higher accuracy. In the end, SVD had 55% accuracy while Logistic Regression only had 44%.

        Code:
        orders_X, orders_Y = df[self.inputs].values, df[self.outputs].values
        # Split data 70% into train and 30% into test
        x_orders_train, x_orders_test, y_orders_train, y_orders_test = train_test_split(orders_X, 
                                                                                        orders_Y, 
                                                                                        test_size = 0.30,
                                                                                        random_state = 0,
                                                                                        stratify=orders_Y)

        columns = [0,1,2,3]
        transformer = Pipeline(steps=[
                        ('scaler', StandardScaler())
                        ])

        preprocessor = ColumnTransformer(
                        transformers=[('preprocess', transformer, columns)])

        pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', SVC(probability=True))])
        model = pipeline.fit(x_orders_train, y_orders_train)
        predictions = model.predict(x_orders_test)

        print (model)   
        print('Predicted labels: ', predictions[:30])
        print('Actual labels   : ' ,y_orders_test[:30])
        
        print("Overall Accuracy:",accuracy_score(y_orders_test, predictions))
        print("Overall Precision:",precision_score(y_orders_test, predictions, average='weighted'))
        return pickle.dumps(model)


I first split the training and test data into a 70/30 split for training. Then, I scaled the columns of the data and put it into a pipline, where the model could fit into SVD. This reduced the parameters and made my model less overfit. Then, I printed out the results and returned the pickled model.

<h2> Running Code <h2>

In [88]:
inputs = ["Year", "Major", "University", "Time"]
output = "Order"
file = "Xtern_traindata.csv"

model = Model(inputs, output)
df= model.load_dataset(file)

(5000, 5)
      Year  Major  University  Time  Order
0        1     16           3     4      2
1        2      5           0     6      7
2        2      5           1     4      6
3        1      3           3     3      2
4        2      4           1     4      5
...    ...    ...         ...   ...    ...
4995     1     12           0     3      0
4996     2      2           3     4      9
4997     2      5           1     5      7
4998     2      2           1     7      7
4999     1      7           8     7      0

[5000 rows x 5 columns]


In [89]:
trained_pickled_model = model.train(df)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('preprocess',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  [0, 1, 2, 3])])),
                ('regressor', SVC(probability=True))])
Predicted labels:  [2 9 7 0 7 7 3 5 7 6 5 5 7 3 5 5 7 0 7 3 5 4 8 0 5 7 5 5 8 7]
Actual labels   :  [2 9 7 0 7 7 3 0 7 6 5 5 7 1 5 5 7 0 9 8 6 4 9 0 1 5 4 5 8 7]
Overall Accuracy: 0.5573333333333333
Overall Precision: 0.5668067664703302


In [90]:
#Inputs: Year 2, Astronomy, Bulter University, 12
inputs = [[1, 2, 1, 4]]
model.predict(pickle.loads(trained_pickled_model), inputs)

#Note: No idea if this is right

Prediction:  Grilled Cheese


<h2>Considerations<h2>

If this is suitable, then I would test this model a lot more, repeatedly checking the predictions of the model to people's actual orders. Depending on it's success, then I would deploy the model for actual use. The model choise would be another consideration, as the one I've coded here is only 55% successful. If I could've found a better model or used a different method to train the model, there could've been a higher accuracy. The last thing I would consider is the scaling of my values. Because this model is meant to predict what someone would order, when I scaled my values, it might have overgeneralized too much and made my accuracy lower than it should've been.