# CMSC 173 - MP 2
## Instructions:
1. Create an overview of the problem being solved, e.g., what was the story behind the collection of the data, description of the attributes/features used,etc.
2. (Data Preprocessing and Exploratory Analysis) Present descriptive statistics as applicable (e.g., distribution, central tendency, variability) of the data before training the models. Clean the data if there are missing values, etc. You may perform feature engineering (i.e., creating new features out of the given features), but be sure to document your justifications. 
3. Split your data into proportions of 70% training set and 30% testing set.
4. Train the following models: (a) logistic regression classifier and (b) naive Bayes classifier on the dataset.
5. Evaluate the performance of the trained model. You may use additional performance measures if you want, but for now I will only require the calculation of the accuracy. The accuracy measures the fraction of correct classifications. With this, you need to generate the confusion matrix. You may read this if you haven't encountered this concept before: https://www.sciencedirect.com/topics/engineering/confusion-matrix#:~:text=A%20confusion%20matrix%20represents%20the,by%20model%20as%20other%20class. Remember to compute this matrix from the test set (not the training set).

In [2]:
using Random
using StatsBase
using CSV
using DataFrames 
using Plots
using Base

In [3]:
dataset = CSV.read("passenger_flight.csv",DataFrame)
Random.seed!(123)
dataset = dataset[shuffle(axes(dataset, 1)), :]

Row,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Online boarding,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
Unnamed: 0_level_1,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64?,Int64
1,1,1,50,1,1,3744,5,5,5,5,3,4,4,4,4,4,4,5,4,3,0,0,1
2,0,1,53,1,1,2661,4,5,5,5,3,1,2,4,4,3,4,4,4,2,6,8,0
3,1,1,20,0,0,541,2,4,2,3,4,2,4,4,2,2,4,2,3,4,38,38,0
4,0,1,52,0,1,944,1,2,1,2,2,3,2,2,2,1,2,1,2,2,34,48,0
5,1,1,33,1,1,406,1,1,1,1,4,4,4,4,3,5,4,3,5,4,0,0,1
6,0,1,51,0,0,621,2,4,2,1,2,4,4,3,3,2,3,5,3,3,0,0,0
7,1,1,25,1,1,3547,2,2,2,2,5,5,5,5,5,5,1,4,4,5,0,0,1
8,0,1,51,1,1,547,4,4,4,4,2,4,5,4,4,4,4,3,4,5,0,0,1
9,0,1,60,0,1,438,2,4,2,3,2,4,4,5,5,2,5,5,5,3,0,0,0
10,1,1,26,1,1,2085,1,1,1,1,5,5,5,5,4,5,5,3,4,5,37,23,1


## Data Preprocessing

In [4]:
# REMOVE MISSING
has_missing = .!completecases(dataset)

# check rows with missing values
rows_with_missing_values = dataset[has_missing, :] # 83 rows have missing values in the Arrival Delay in Minutes column
    
# remove missing values since it is difficult to fill the missing values
dataset = dataset[.!has_missing, :]

# rename column names
col_names = names(dataset)
new_col_names = map(lowercase, String.(col_names)) # convert to lower case
new_col_names .= replace.(new_col_names, " "=>"_", "-"=>"", "/"=>"_") # replace spaces and slash with underscore and remove dashes, 
rename!(dataset, new_col_names)

Row,gender,customer_type,age,type_of_travel,class,flight_distance,inflight_wifi_service,departure_arrival_time_convenient,ease_of_online_booking,gate_location,food_and_drink,online_boarding,seat_comfort,inflight_entertainment,onboard_service,leg_room_service,baggage_handling,checkin_service,inflight_service,cleanliness,departure_delay_in_minutes,arrival_delay_in_minutes,satisfaction
Unnamed: 0_level_1,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64?,Int64
1,1,1,50,1,1,3744,5,5,5,5,3,4,4,4,4,4,4,5,4,3,0,0,1
2,0,1,53,1,1,2661,4,5,5,5,3,1,2,4,4,3,4,4,4,2,6,8,0
3,1,1,20,0,0,541,2,4,2,3,4,2,4,4,2,2,4,2,3,4,38,38,0
4,0,1,52,0,1,944,1,2,1,2,2,3,2,2,2,1,2,1,2,2,34,48,0
5,1,1,33,1,1,406,1,1,1,1,4,4,4,4,3,5,4,3,5,4,0,0,1
6,0,1,51,0,0,621,2,4,2,1,2,4,4,3,3,2,3,5,3,3,0,0,0
7,1,1,25,1,1,3547,2,2,2,2,5,5,5,5,5,5,1,4,4,5,0,0,1
8,0,1,51,1,1,547,4,4,4,4,2,4,5,4,4,4,4,3,4,5,0,0,1
9,0,1,60,0,1,438,2,4,2,3,2,4,4,5,5,2,5,5,5,3,0,0,0
10,1,1,26,1,1,2085,1,1,1,1,5,5,5,5,4,5,5,3,4,5,37,23,1


In [6]:
# split dataframe into 2 df depending on pct
function splitdf(df, pct)
    @assert 0 <= pct <= 1
    ids = collect(axes(df, 1))
    shuffle!(ids)
    sel = ids .<= nrow(df) .* pct
    train = view(df, sel, :)
    test = view(df, .!sel, :)

    # println(hcat(train[:,1:end-1], DataFrame("satisfaction"=>train[:,end])) == train)

    return train[:,1:end-1], DataFrame("satisfaction"=>train[:,end]), test[:,1:end-1], DataFrame("satisfaction" => test[:,end])
end

(x_train, y_train, x_test, y_test) = splitdf(dataset, 0.7)

([1m18125×22 DataFrame[0m
[1m   Row [0m│[1m gender [0m[1m customer_type [0m[1m age   [0m[1m type_of_travel [0m[1m class [0m[1m flight_distance [0m[1m[0m ⋯
       │[90m Int64  [0m[90m Int64         [0m[90m Int64 [0m[90m Int64          [0m[90m Int64 [0m[90m Int64           [0m[90m[0m ⋯
───────┼────────────────────────────────────────────────────────────────────────
     1 │      1              1     50               1      1             3744  ⋯
     2 │      0              1     52               0      1              944
     3 │      1              1     33               1      1              406
     4 │      1              1     25               1      1             3547
     5 │      0              1     51               1      1              547  ⋯
     6 │      0              1     60               0      1              438
     7 │      1              1     26               1      1             2085
     8 │      1              1     17             

## Naive Bayes

In [11]:
# build conditional probability table

cont_col_names = ["age", "flight_distance", "departure_delay_in_minutes", "arrival_delay_in_minutes"]
disc_col_names = [name for name in names(dataset) if name ∉ cont_col_names && name≠"satisfaction"]
train = hcat(x_train, y_train)

# calculate discrete probabilities
function count_disc_prob(df, col_name)
    return combine(groupby(df, [col_name, "satisfaction"]), nrow)
end

cond_prob_table = Dict()
for name in disc_col_names
    cond_prob_table[name] = count_disc_prob(train, name)
end

# calculate continuous probabilities
function count_cont_prob(df, col_name)
    a = combine(groupby(df, "satisfaction"), [col_name] => mean, [col_name] => std)
end

for name in cont_col_names
    cond_prob_table[name] = count_cont_prob(train, name)
end

[1m2×3 DataFrame[0m
[1m Row [0m│[1m satisfaction [0m[1m age_mean [0m[1m age_std [0m
     │[90m Int64        [0m[90m Float64  [0m[90m Float64 [0m
─────┼─────────────────────────────────
   1 │            0   38.0589  16.5255
   2 │            1   41.6782  12.955


In [12]:
# calculate likelihood for continuous data
function likelihood(cond_prob_table, feature, satisfaction, x)
    feature_table = cond_prob_table[feature]
    prob_values = filter(row -> row.satisfaction == satisfaction, feature_table)

    # get mean and variance
    μ = prob_values[1,2]
    σ = prob_values[1,3]

    return (1/(σ * sqrt(2π))) * exp((-1/2) * ((x-μ)/σ)^2)
end

# calculate probabilities for discrete (categorical) data
function disc_cond_prob(cond_prob_table, feature, satisfaction, x)
    feature_table = cond_prob_table[feature]
    feature_table = filter(row -> row.satisfaction==satisfaction, feature_table)
    total = sum(feature_table[:,:nrow])
    
    val = 0
    try
        val = filter(row -> row[feature] == x, feature_table)[1,end]
    catch
        val = 0
    end

    # apply laplace smoothing
    return (val+1)/(total+1)
end

# run test
function test()
    correct = 0
    not_correct = 0

    # iterate all training data
    for i in 1:size(x_train)[1]
        test_case = x_train[i,:]
        p_satisfied_proportional = 1
        p_not_satisfied_proportional = 1

        # get probabilities of all features
        for col_name in names(test_case)

            # treat discrete and continuous features separately
            if col_name ∈ disc_col_names
                p_satisfied_proportional *= disc_cond_prob(cond_prob_table, col_name, 1, test_case[col_name])
                p_not_satisfied_proportional *= disc_cond_prob(cond_prob_table, col_name, 0, test_case[col_name])
            else
                p_satisfied_proportional *= likelihood(cond_prob_table, col_name, 1, test_case[col_name])
                p_not_satisfied_proportional *= likelihood(cond_prob_table, col_name, 0, test_case[col_name])
            end
        end

        # calculate probabilities
        p_satisfied = (p_satisfied_proportional / (p_satisfied_proportional+p_not_satisfied_proportional))
        p_not_satisfied = (p_not_satisfied_proportional / (p_satisfied_proportional+p_not_satisfied_proportional))
        
        # count correct and incorrect predictions
        if (p_satisfied > p_not_satisfied && y_train[i,1] == 1) || (p_satisfied < p_not_satisfied && y_train[i,1] == 0)
            correct += 1
        else
            not_correct += 1
        end
    end

    println("Correct predictions: ", correct)
    println("Incorrect predictions: ", not_correct)
    println("Accuracy: ", (correct / (correct + not_correct))*100)
end

test()


Correct predictions: 15924
Incorrect predictions: 2201
Accuracy: 87.85655172413793
