# Practical Exercises 
These exercises are for practicing the NumPy skills learned so far.

## 1. Working with API Image Data in Numpy and Matplotlib
- Write a program to download the images from the [Metropolitan Museum of Art API](https://metmuseum.github.io)

- Generate an image (like the one shown below) by plotting random images from the collection

<img src="../assets/met_example_image.png" alt="" width="500"/>

In [None]:
import http
import http.client
import json
import os
import pickle
import random
import urllib
import urllib.error
import urllib.request

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from mpl_toolkits.axes_grid1 import ImageGrid
from tqdm.notebook import tqdm

def get_images(output_dir="./data/images", timeout=30):
    """Download the small primary image of every Met collection object.

    Fetches the list of object IDs, requests each object's metadata and,
    when the object has a usable ``primaryImageSmall`` URL, saves the image
    as ``image_object_id_<id>.jpg`` inside ``output_dir``. Objects whose
    metadata or image cannot be fetched are skipped (best effort).

    Args:
        output_dir: Directory the images are written to; created if missing.
        timeout: Seconds to wait on each metadata request before giving up.

    Raises:
        requests.exceptions.RequestException: if the initial request for the
            list of object IDs fails.
    """
    base_url = "https://collectionapi.metmuseum.org/public/collection/v1/objects"

    # urlretrieve does not create missing directories.
    os.makedirs(output_dir, exist_ok=True)

    # Getting all of the object IDs; without them nothing can be done, so a
    # failure here is allowed to propagate.
    all_objects_request = requests.get(base_url, timeout=timeout)
    all_objects_request.raise_for_status()
    object_ids_list = all_objects_request.json().get('objectIDs') or []

    for i in tqdm(sorted(object_ids_list)):
        # Getting the metadata of a single object. One bad object must not
        # abort the whole crawl, so request/decoding errors are skipped.
        try:
            individual_object_request = requests.get(f"{base_url}/{i}", timeout=timeout)
            individual_object_request.raise_for_status()
            individual_object_data = individual_object_request.json()
        except (requests.exceptions.RequestException, ValueError):
            continue

        # Not all items have an image; .get() also covers responses that
        # lack the field entirely (e.g. error payloads).
        image_url = individual_object_data.get('primaryImageSmall')
        if not image_url:
            continue
        # Some image links contain a space and are not readable.
        if " " in image_url:
            continue

        target_file = os.path.join(output_dir, f"image_object_id_{i}.jpg")
        try:
            urllib.request.urlretrieve(image_url, target_file)
        except (urllib.error.URLError, http.client.HTTPException, UnicodeError):
            # Unreachable or malformed image link: skip this object.
            continue
        
# Run the download; this issues one HTTP request per object ID returned by the API.
get_images()

In [None]:
def load_images(path):
    """Return the sorted paths of all ``.jpg`` files directly inside ``path``."""
    jpg_paths = []
    for entry in os.listdir(path):
        # Keep only JPEG files; everything else in the folder is ignored.
        if entry.endswith('.jpg'):
            jpg_paths.append(os.path.join(path, entry))
    jpg_paths.sort()
    return jpg_paths


def plot_images():
    """Save a 5x10 mosaic of randomly chosen downloaded images.

    Takes the first 30000 ``.jpg`` files (sorted by path) in
    ``./data/images/``, draws 50 of them at random (with replacement, the
    ``np.random.choice`` default), resizes each to 200x200 RGB and writes
    the grid to ``./data/images/image_grid.png``. Files that OpenCV cannot
    decode are left out of the grid.

    Raises:
        ValueError: if the image directory contains no ``.jpg`` files.
    """
    image_path = "./data/images/"
    candidate_files = load_images(image_path)[:30000]
    if not candidate_files:
        raise ValueError(f"no .jpg images found in {image_path!r}")

    n_rows, n_cols = 5, 10

    # Sample first and decode only the chosen files, instead of decoding and
    # resizing every candidate just to display 50 of them.
    sample = np.random.choice(len(candidate_files), n_rows * n_cols)

    example_images = []
    for i in sample:
        bgr_image = cv2.imread(candidate_files[i])
        if bgr_image is None:
            # cv2.imread signals an unreadable file by returning None.
            continue
        # OpenCV loads BGR; matplotlib expects RGB.
        rgb_image = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)
        example_images.append(cv2.resize(rgb_image, (200, 200)))

    # Making the plots
    fig = plt.figure(figsize=(50, 40))
    grid = ImageGrid(fig, 111,
                     nrows_ncols=(n_rows, n_cols),
                     axes_pad=0,
                     )
    # NOTE(review): this only affects the current axes, not every grid cell
    # -- confirm that is the intent.
    plt.axis('off')
    for ax, im in zip(grid, example_images):
        # Iterating over the grid returns the Axes.
        ax.imshow(im)
        ax.grid(False)

        # Hide axes ticks
        ax.set_xticks([])
        ax.set_yticks([])

    fig.savefig('./data/images/image_grid.png', bbox_inches='tight')
    plt.close(fig)
    return

# Build the image grid and save it to ./data/images/image_grid.png.
plot_images()

## 2. Regression factors
The formula for the regression coefficients is

$\beta = (X'X)^{-1}X'Y$

However, the data is a bit messed up: all of the variables are stored in a single flat array. That means we have a 1xN vector, i.e. the data was changed from this:

<img src="../assets/data_before.png" alt="" width="500"/>

to that:

<img src="../assets/data_after.png" alt="" width="700"/>

The array contains the following variables: 

- Sale (in Dollars) - Amount of money received by the store
- Pack Size - Number of bottles per item
- State Bottle Cost - Cost of producing the bottle 
- Packs Sold - Amount of bottles sold
- Bottle Volume (in ml) - How many ml each bottle has



Question: Determine the regression coefficients of the following OLS regression

$\text{Sale} = \beta_0 + \beta_1 \cdot \text{Pack Size} + \beta_2 \cdot \text{State Bottle Cost} + \beta_3 \cdot \text{Packs Sold} + \beta_4 \cdot \text{Bottle Volume} + \epsilon$

In [None]:
def beta_coefficients(data_path='../data/data.pkl'):
    """Estimate the OLS coefficients of Sale on the four regressors.

    Loads the flattened data array, restores the one-row-per-observation
    table, prepends an intercept column and solves the normal equations
    ``(X'X) beta = X'Y``.

    Args:
        data_path: Pickle file holding the flat data array. Defaults to the
            location used by the exercise.

    Returns:
        1-D array ``[beta_0, beta_1, ..., beta_4]``: the intercept followed
        by the coefficients in the order of ``column_names[1:]``.

    Raises:
        ValueError: if the array size is not a multiple of the number of
            columns, or a value cannot be converted to float.
        numpy.linalg.LinAlgError: if ``X'X`` is singular.
    """
    # Loading the data.
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(data_path, 'rb') as f:
        data = pickle.load(f)

    # Column names, in the order the values appear within each observation
    column_names = ["Sale (in Dollars)", "Pack Size", "State Bottle Cost", "Packs Sold", "Bottle Volume (in ml)"]

    # Reshaping the flat array into one row per observation and one column
    # per variable; the row count is inferred instead of hard-coded.
    reshaped_data = np.asarray(data).reshape(-1, len(column_names))

    # Changing the string variables to floats (np.float no longer exists in
    # current NumPy; np.float64 is the same type).
    float_data = reshaped_data.astype(np.float64)

    # Separating the Sale variable from the rest
    independent = float_data[:, 1:]
    Y = float_data[:, 0]

    # Adding a column of ones so the first coefficient is the intercept
    ones = np.ones(independent.shape[0])
    X = np.c_[ones, independent]

    # Applying the regression coefficient formula. Solving the linear system
    # gives the same beta as (X'X)^-1 X'Y without forming the inverse.
    X_prime = np.transpose(X)
    beta = np.linalg.solve(np.dot(X_prime, X), np.dot(X_prime, Y))

    # Returning the coefficients (intercept first)
    return beta

In [None]:
def stats_package(data_path='../data/data.pkl'):
    """Fit the same OLS regression with statsmodels and return its summary.

    Loads the flattened data array, restores the one-row-per-observation
    table, prepends an intercept column and fits ``sm.OLS(Y, X)``.

    Args:
        data_path: Pickle file holding the flat data array. Defaults to the
            location used by the exercise.

    Returns:
        The object returned by ``results.summary()`` for the fitted model.

    Raises:
        ValueError: if the array size is not a multiple of the number of
            columns, or a value cannot be converted to float.
    """
    # `sm` is used below but never imported at file level; the alias is
    # bound here so the function works on its own.
    import statsmodels.api as sm

    # Loading the data.
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(data_path, 'rb') as f:
        data = pickle.load(f)

    # Column names, in the order the values appear within each observation
    column_names = ["Sale (in Dollars)", "Pack Size", "State Bottle Cost", "Packs Sold", "Bottle Volume (in ml)"]

    # Reshaping the flat array into one row per observation and one column
    # per variable; the row count is inferred instead of hard-coded.
    reshaped_data = np.asarray(data).reshape(-1, len(column_names))

    # Changing the string variables to floats (np.float no longer exists in
    # current NumPy; np.float64 is the same type).
    float_data = reshaped_data.astype(np.float64)

    # Separating the Sale variable from the rest
    independent = float_data[:, 1:]
    Y = float_data[:, 0]

    # Adding a column of ones so the first coefficient is the intercept
    ones = np.ones(independent.shape[0])
    X = np.c_[ones, independent]

    # Defining statistical model
    model = sm.OLS(Y, X)

    # Fitting the results
    results = model.fit()

    # Returning the entire OLS summary statistics
    return results.summary()