# Pattern Recognition and Machine Learning
## Week 3 Tutorial

### 1. Read a csv file

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the Iris measurements into a DataFrame; the five column names are
# supplied explicitly through `names`.
iris = pd.read_csv(
    "data/iris.data",
    sep=',',
    names=[
        "sepal_length",
        "sepal_width",
        "petal_length",
        "petal_width",
        "species",
    ],
)

### 2. Explore the data

In [None]:
# Preview the first rows of the data set.
iris.head()

In [None]:
# Print a summary of the columns: dtypes and non-null counts.
iris.info()

In [None]:
# Count the missing values in each column.
iris.isnull().sum()

In [None]:
# Number of samples per distinct value of the species column.
iris["species"].value_counts()

In [None]:
# Display the species column on its own.
iris["species"]

### 3. Visualize data using Matplotlib

In [None]:
import seaborn as sns

In [None]:
# Pie chart of how many samples belong to each species; every wedge is
# annotated with its percentage to two decimals.
species_counts = iris["species"].value_counts()
iplot = species_counts.plot(kind='pie', autopct="%.2f", figsize=(8, 8))

# Remove the default y-axis label and use a white background.
iplot.set_ylabel('')
iplot.set_facecolor('white')

In [None]:
# boxplot of the numeric columns, with the samples grouped by species
iris.boxplot(by="species", figsize=(12, 6))

In [None]:
# Scatter plot of sepal length against sepal width, one colour per species.
sns.set(style='darkgrid')

# (value in the species column, marker colour, legend label)
species_styles = [
    ("Iris-setosa", 'red', "Setosa"),
    ("Iris-versicolor", 'green', "Versicolor"),
    ("Iris-virginica", 'orange', "Virginica"),
]

# With ax=None the first call creates the axes; every later call draws onto
# the axes returned by the previous one, so all species share a single plot.
sc = None
for species_name, colour, legend_label in species_styles:
    sc = iris[iris.species == species_name].plot(kind='scatter',
                                                 x="sepal_length",
                                                 y="sepal_width",
                                                 color=colour,
                                                 label=legend_label,
                                                 ax=sc)

sc.set_xlabel("Sepal Length in cm")
sc.set_ylabel("Sepal Width in cm")
sc.set_title("Sepal Length vs Sepal Width")

### 4. Linear regression on a multi-variable dataset

In [None]:
import numpy as np

# metrics to evaluate
from sklearn.metrics import mean_squared_error, mean_absolute_error

# split the dataset into train/test sets
from sklearn.model_selection import train_test_split

# import linear regression
from sklearn.linear_model import LinearRegression

In [None]:
# Features: every measurement except the one being predicted; the species
# label is dropped as well.
X = iris.drop(columns=["petal_width", "species"])

# Regression target: the petal width.
y = iris["petal_width"]

# Hold out 25% of the rows for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1)

In [None]:
# (rows, columns) of the training features.
X_train.shape

In [None]:
# (rows, columns) of the test features.
X_test.shape

In [None]:
# Fit a linear regression model on the training split.
lre = LinearRegression()
lre.fit(X_train, y_train)

# Predict the petal width of the held-out rows.
result = lre.predict(X_test)

# Report the error metrics. The MSE is computed once and reused for the
# RMSE, which is simply its square root.
mse = mean_squared_error(y_test, result)
print("Mean Absolute Error:", mean_absolute_error(y_test, result))
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", np.sqrt(mse))

# Per-sample absolute error (shown as the cell output).
abs(y_test - result)

In [None]:
# Visualize the result: actual vs. predicted petal width for the test rows,
# both plotted against sepal length.
plt.scatter(X_test[["sepal_length"]], y_test,
            color="red", label="Actual petal_width")
# The second series holds the model output, so it must not reuse the
# "Actual" legend entry.
plt.scatter(X_test[["sepal_length"]], result,
            color="yellow", label="Predicted petal_width")
plt.legend()
plt.xlabel("Sepal Length")
plt.ylabel("Petal Width")

In [None]:
# Table of the true petal width next to the prediction rounded to one
# decimal place.
compare = pd.DataFrame(y_test).assign(predict=np.round(result, 1))

# "bias" is the signed difference: actual value minus rounded prediction.
compare["bias"] = compare["petal_width"] - compare["predict"]

print(compare)

### 5. Linear regression on Wine Quality dataset

In [None]:
# Load the red and white wine tables; both files use ';' as the separator.
wine_red_data = pd.read_csv("data/winequality-red.csv", sep=';')
wine_white_data = pd.read_csv("data/winequality-white.csv", sep=';')

# Stack the two tables into one. ignore_index=True renumbers the rows;
# without it each file keeps its own 0..n-1 labels, so the combined frame
# would carry duplicate index labels.
wine_dataset = pd.concat([wine_red_data, wine_white_data], ignore_index=True)

In [None]:
# Preview the first five rows of the combined wine table.
wine_dataset.head(5)

In [None]:
# List the column names of the wine table.
list(wine_dataset.columns)

In [None]:
# Print a summary of the columns: dtypes and non-null counts.
wine_dataset.info()

In [None]:
# Features: every column except the quality score being predicted.
X = wine_dataset.drop("quality", axis="columns")

# Regression target: the quality score.
y = wine_dataset["quality"]

# Hold out 25% of the rows for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1)

In [None]:
# Fit a linear regression model on the training split.
lre = LinearRegression()
lre.fit(X_train, y_train)

# Predict the quality score of the held-out rows.
result = lre.predict(X_test)

# Report the error metrics. The MSE is computed once and reused for the
# RMSE, which is simply its square root.
mse = mean_squared_error(y_test, result)
print("Mean Absolute Error:", mean_absolute_error(y_test, result))
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", np.sqrt(mse))

In [None]:
# compare result: actual quality next to the model's prediction, one row per
# test sample
print(pd.DataFrame({"Actual": y_test, "Predict": result}))

In [None]:
# print the fitted intercept, then the fitted coefficients
print(lre.intercept_)
print(lre.coef_)

In [None]:
# Visualize the result: actual vs. predicted quality for the test rows,
# both plotted against the citric acid feature.
plt.scatter(X_test[["citric acid"]], y_test,
            color="red", label="Actual wine quality")
# The second series holds the model output, so it must not reuse the
# "Actual" legend entry.
plt.scatter(X_test[["citric acid"]], result,
            color="yellow", label="Predicted wine quality")
plt.legend()
# The x-axis shows the "citric acid" column, so label it accordingly.
plt.xlabel("Citric Acid")
plt.ylabel("Quality")

## Question:
1. Investigate the effect of the test-set size on the model’s performance;
2. Build and train a model, then make predictions, for a dataset of your choice.

In [None]:
# Features: the wine table without the target and without three further
# columns (pH, alcohol, residual sugar).
X = wine_dataset.drop(columns=["quality", "pH", "alcohol", "residual sugar"])

# Regression target: the quality score.
y = wine_dataset["quality"]

# Hold out 25% of the rows for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1)

In [None]:
# Fit a linear regression model on the reduced feature set.
lre = LinearRegression()
lre.fit(X_train, y_train)

# Predict the quality score of the held-out rows.
result = lre.predict(X_test)

# Report the error metrics. The MSE is computed once and reused for the
# RMSE, which is simply its square root.
mse = mean_squared_error(y_test, result)
print("Mean Absolute Error:", mean_absolute_error(y_test, result))
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", np.sqrt(mse))

In [None]:
# Table of the actual quality, the prediction, and their signed difference
# ("bias" = actual minus predicted).
compare = pd.DataFrame({"Actual": y_test, "Predict": result}).assign(
    bias=lambda frame: frame["Actual"] - frame["Predict"])
print(compare)

# Modified National Institute of Standards and Technology database (MNIST)
## Using Logistic Regression to classify images of handwritten digits from MNIST

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import fetch_openml

# Download the "mnist_784" data set from OpenML.
#  - as_frame=False returns plain NumPy arrays, so positional indexing such
#    as digits.data[0] and row-wise iteration work in the cells below
#    (a pandas DataFrame would index and iterate by column instead).
#  - version=1 pins the data set version so the result is reproducible.
digits = fetch_openml("mnist_784", version=1, as_frame=False)

# NOTE: sklearn.datasets.load_digits() is a smaller, offline alternative.

In [None]:
# Show the fourth sample as an image. The Bunch returned by fetch_openml has
# no `images` attribute, so the picture is rebuilt from the flat pixel row.
# Slicing first keeps this working for both NumPy arrays and DataFrames.
sample = np.asarray(digits.data[3:4])[0]
# Images are assumed to be square; derive the side from the pixel count.
side = int(round(np.sqrt(sample.size)))

plt.gray()
plt.matshow(sample.reshape(side, side))
plt.show()

## 3 Exploring Data
### 3.1 Show the image and data

In [None]:
# Plot the first ten samples in a 2x5 grid, each titled with its label.
# Converting the row slices to arrays makes the loop iterate over rows even
# if the data was loaded as a pandas DataFrame (zip would otherwise walk
# over its column names).
first_images = np.asarray(digits.data[0:10])
first_labels = np.asarray(digits.target[0:10])
# Images are assumed to be square; derive the side from the pixel count
# instead of hard-coding 8x8.
side = int(round(np.sqrt(first_images.shape[1])))

plt.figure(figsize=(10, 6))
for index, (image, label) in enumerate(zip(first_images, first_labels)):
    plt.subplot(2, 5, index + 1)
    plt.imshow(np.reshape(image, (side, side)), cmap=plt.cm.gray)
    plt.title('Training: %i\n' % int(label), fontsize=15)

### 3.2 Show the corresponding matrix

In [None]:
# Show the raw pixel values of the first sample. Slicing the first row and
# converting it to an array works for NumPy arrays and DataFrames alike
# (digits.data[0] on a DataFrame would be a column lookup).
np.asarray(digits.data[0:1])[0]

## 4 Build a Logistic model to classify MNIST images

In [None]:
from sklearn.metrics import confusion_matrix

### 4.2 Step 1: Select target variable

In [None]:
# Use the data set's target (one label per sample) as the variable to predict.
y = digits.target
# Display the first label.
y[0]

### 4.3 Step 2: Prepare data

In [None]:
# Number of samples in the data set.
n_samples = len(digits.data)
n_samples

In [None]:
# Use the pixel data as the feature matrix and display its shape.
X = digits.data
X.shape

In [None]:
# Print the feature matrix.
print(X)

### 4.4 Step 3: Split dataset

In [None]:
# Hold out 25% of the samples for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.25,
                                                    random_state=1)

### 4.5 Step 4: Select a Logistic Regression classifier

In [None]:
# Logistic regression classifier using the lbfgs solver with an explicit
# iteration limit of 5000.
lgr = LogisticRegression(solver = 'lbfgs', max_iter=5000)

### 4.6 Step 5: Fit the data

In [None]:
# Train the classifier on the training split.
lgr.fit(X_train, y_train)

### 4.7 Step 6: Predict on test data

In [None]:
# Predict a label for every test sample.
result = lgr.predict(X_test)

### Measuring Model Performance:
accuracy (fraction of correct predictions) =
correct predictions / total number of data points

In [None]:
# Accuracy of the classifier on the test split.
score = lgr.score(X_test, y_test)
print(score)

### 4.8 Step 7: Create confusion matrix

In [None]:
# Confusion matrix of the true test labels against the predicted labels.
confusion_matrix(y_test, result)

### 4.9 Visualize the prediction

In [None]:
# Pair every sample's pixel row with the label the classifier predicts for it.
# np.asarray guarantees row-wise iteration: zipping a pandas DataFrame
# directly would iterate over its column names instead of its rows.
# NOTE: this predicts on the full data set, training samples included.
images_and_predictions = list(zip(np.asarray(digits.data), lgr.predict(X)))

In [None]:
# Show the first 20 samples in a 4x5 grid, each titled with its prediction.
plt.figure(figsize=(10, 10))
for index, (image, prediction) in enumerate(images_and_predictions[:20]):
    pixels = np.asarray(image)
    # Images are assumed to be square; derive the side from the pixel count
    # instead of hard-coding 8x8.
    side = int(round(np.sqrt(pixels.size)))
    plt.subplot(4, 5, index + 1)
    plt.axis("off")
    plt.imshow(np.reshape(pixels, (side, side)), cmap=plt.cm.gray_r,
               interpolation='nearest')
    plt.title('Prediction: %i' % int(prediction))
plt.show()

Display misclassified images:

In [None]:
# Positions within the test split where the prediction differs from the
# true label.
misclassifiedIndexes = [
    position
    for position, (label, predict) in enumerate(zip(y_test, result))
    if label != predict
]

In [None]:
# Show the first ten misclassified test samples with predicted and actual
# labels. misclassifiedIndexes holds positions, so the test data is
# converted to arrays to index by position even if it is a pandas object
# (where [] would look up labels or columns instead).
test_images = np.asarray(X_test)
test_labels = np.asarray(y_test)
# Images are assumed to be square; derive the side from the pixel count
# instead of hard-coding 8x8.
side = int(round(np.sqrt(test_images.shape[1])))

plt.figure(figsize=(20, 15))
for plotIndex, badIndex in enumerate(misclassifiedIndexes[0:10]):
    plt.subplot(4, 5, plotIndex + 1)
    plt.axis("off")
    plt.imshow(np.reshape(test_images[badIndex], (side, side)),
               cmap=plt.cm.gray, interpolation='nearest')
    plt.title('Predicted: {}, Actual: {}'.format(result[badIndex],
                                                 test_labels[badIndex]),
              fontsize=15)