# CPSC6300 Final Exam Problem 37
# Author: Wenkang Wei

# Download Dataset

In [1]:
train_url = "https://www.palmetto.clemson.edu/dsci/datasets/exams/exam_train.csv"
test_url = "https://www.palmetto.clemson.edu/dsci/datasets/exams/exam_test.csv"

In [2]:
import pandas as pd
import requests
import numpy as np
import os

def download_data(data_url, file_path="."):
    r = requests.get(data_url, verify=False)
    with open(file_path, "wb") as f:
        f.write(r.content)
def load_data(data_url, local_cached_datafile):
    if not os.path.exists(local_cached_datafile):
        if not os.path.exists(os.path.dirname(local_cached_datafile)):
            os.makedirs(os.path.dirname(local_cached_datafile))
        download_data(data_url, local_cached_datafile)
    return pd.read_csv(local_cached_datafile)

train_input_dir = os.path.join(os.getcwd(), 'exam_train.csv')
test_input_dir = os.path.join(os.getcwd(), 'exam_test.csv')
# download training set and test set
download_data(train_url, train_input_dir)
download_data(train_url, test_input_dir)




In [3]:
train_df = pd.read_csv("exam_train.csv")
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112 entries, 0 to 111
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  112 non-null    float64
 1   sepal_width   112 non-null    float64
 2   petal_length  112 non-null    float64
 3   petal_width   112 non-null    float64
 4   class         112 non-null    object 
dtypes: float64(4), object(1)
memory usage: 4.5+ KB


In [4]:
test_df = pd.read_csv("exam_test.csv")
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112 entries, 0 to 111
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  112 non-null    float64
 1   sepal_width   112 non-null    float64
 2   petal_length  112 non-null    float64
 3   petal_width   112 non-null    float64
 4   class         112 non-null    object 
dtypes: float64(4), object(1)
memory usage: 4.5+ KB


In [5]:
train_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,6.2,3.4,5.4,2.3,Iris-virginica
1,6.7,3.3,5.7,2.5,Iris-virginica
2,5.8,4.0,1.2,0.2,Iris-setosa
3,5.7,3.0,4.2,1.2,Iris-versicolor
4,6.3,2.5,5.0,1.9,Iris-virginica


# Get target and trainingset, test set

In [6]:
y_train = train_df["class"].astype("category")
X_train = train_df.drop(columns=["class"])

y_test = test_df["class"].astype("category")
X_test = test_df.drop(columns=["class"])

In [7]:
y_train.head()

0     Iris-virginica
1     Iris-virginica
2        Iris-setosa
3    Iris-versicolor
4     Iris-virginica
Name: class, dtype: category
Categories (3, object): [Iris-setosa, Iris-versicolor, Iris-virginica]

# Model 1: Logistic Regression

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
lr_clf = LogisticRegression(random_state=0, max_iter =200)
lr_model = lr_clf.fit(X_train, y_train)

# Training Accuracy

In [9]:
preds_train = lr_model.predict(X_train)
accuracy_score(preds_train, y_train)

0.9821428571428571

# Testing Accuracy

In [10]:
preds_test = lr_model.predict(X_test)
possib_test = lr_model.predict_proba(X_test)
accuracy_score(preds_test, y_test)

0.9821428571428571

# Print wrong prediction results 

In [11]:
def print_wrong_samples(model,X ,y):
    # make prediction
    predictions = model.predict(X)
    possibility = model.predict_proba(X)
    possib_dic = {}
    # convert possibility to dataframe
    for i, name in enumerate(model.classes_):
        possib_dic[name+"_possibility"] = possibility[:, i]
        
    possib_df = pd.DataFrame(possib_dic)
    
    # find the samples that are misclassified
    possib_df = possib_df[predictions!=y]
    df = pd.DataFrame({"wrong sample":predictions[predictions!=y],
                       "target":y[predictions!=y]})
    df = pd.concat([df, possib_df], axis=1)
    return df


In [12]:
model1_results = print_wrong_samples(lr_model, X_test,y_test)
model1_results

Unnamed: 0,wrong sample,target,Iris-setosa_possibility,Iris-versicolor_possibility,Iris-virginica_possibility
36,Iris-virginica,Iris-versicolor,0.003206,0.485161,0.511634
57,Iris-virginica,Iris-versicolor,0.000713,0.364304,0.634983


# Model 2: Linear SVC

In [13]:
from sklearn.svm import SVC
svc_clf = SVC(random_state=0,probability=True, kernel="linear")
svc_clf = svc_clf.fit(X_train, y_train)

# Training Accuracy

In [14]:
preds_train = svc_clf.predict(X_train)
accuracy_score(preds_train, y_train)


0.9910714285714286

# Testing Accuracy

In [15]:
preds_test = svc_clf.predict(X_test)
accuracy_score(preds_test, y_test)

0.9910714285714286

# Print wrong prediction results

In [16]:
model2_results = print_wrong_samples(svc_clf, X_test,y_test)
model2_results

Unnamed: 0,wrong sample,target,Iris-setosa_possibility,Iris-versicolor_possibility,Iris-virginica_possibility
57,Iris-virginica,Iris-versicolor,0.01966,0.247025,0.733314
