# MSDS 534: Statistical Learning - Homework 4

### NAME:  _______

### NET ID:  _______

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import TensorDataset, DataLoader
import warnings
import itertools
from transformers import BertTokenizer, BertModel


warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


## Movie data
* Each movie has a short "overview" text description.
* Each movie is assigned a set of genres (possibly more than one).


### key attributes:
* The genre indicator columns are listed in `genres_to_predict`.
* The BERT features of the "overview" text are stored in the columns listed in `bert_feats`.

In [2]:
# Read the movie table and shuffle its rows with a fixed seed so the row
# order (and therefore every later positional split) is reproducible.
movies_df = (
    pd.read_csv('movies_embedding.csv')
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Genre indicator columns used as prediction targets.
genres_to_predict = [
    'Action',
    'Adventure',
    'Comedy',
    'Drama',
    'Romance',
    'Science Fiction',
    'Crime',
    'Horror',
    'Fantasy',
    'Animation',
]

# Names of the BERT feature columns: bert_0 ... bert_767.
BERT_LEN = 768
bert_feats = [f'bert_{i}' for i in range(BERT_LEN)]

preview_columns = ["title", "overview"] + genres_to_predict
print(movies_df[preview_columns].head())



                         title  \
0            The Shipping News   
1                Jack and Jill   
2                    Show Boat   
3  The Man with the Iron Fists   
4                     Red Dawn   

                                            overview  Action  Adventure  \
0  An emotionally-beaten man with his young daugh...       0          0   
1  Jack Sadelstein, a successful advertising exec...       0          0   
2  A dashing Mississippi river gambler wins the a...       0          0   
3  In feudal China, a blacksmith who makes weapon...       1          0   
4  It is the mid-1980s. From out of the sky, Sovi...       1          0   

   Comedy  Drama  Romance  Science Fiction  Crime  Horror  Fantasy  Animation  
0       0      1        1                0      0       0        0          0  
1       1      0        0                0      0       0        0          0  
2       0      0        1                0      0       0        0          0  
3       0      0        

## Create data

In [3]:
# Features: the BERT columns; targets: the genre indicator columns.
# Both are materialised as float32 tensors (torch.tensor always copies).
x = torch.tensor(movies_df[bert_feats].to_numpy(), dtype=torch.float32)
y = torch.tensor(movies_df[genres_to_predict].to_numpy(), dtype=torch.float32)


## Question 1:

Split data X, Y into training and test data.

Further split the training data into D1 and D2, where D1 is used to train the predictive model and D2 is used to calibrate the prediction set.

In [None]:
# Split sizes. The first `ntrain` rows are the training data and all remaining
# rows are the test data. `n1` is presumably the number of training rows that
# go to D1 (the D2 score array built later has ntrain - n1 entries).
ntrain = 3000
n1 = 2500

# Positional split: rows were already shuffled when the data was loaded.
x_train = x[:ntrain]
x_test = x[ntrain:]

y_train = y[:ntrain]
y_test = y[ntrain:]

# TODO(student): D1 -- the part of the training data used to fit the model.
x_train1 = [INSERT CODE]
y_train1 = [INSERT CODE]

# TODO(student): D2 -- the part of the training data used to calibrate the
# prediction set.
x_train2 = [INSERT CODE]
y_train2 = [INSERT CODE]

## Question 2: 
Create a feed-forward neural network to predict movie genres

In [None]:
## NN should have 2 hidden layers, with dimension hidden_dims[0] and hidden_dims[1]
## Use ReLU as the activation function for the hidden layers
## Use Sigmoid as the activation function for the output layer
class MovieNet(nn.Module):
    """Feed-forward network mapping an input feature vector to genre outputs.

    The layer stack is left for the student to fill in; per the instructions
    above it is meant to have two ReLU hidden layers followed by a Sigmoid
    output layer.
    """

    def __init__(self, input_dim, hidden_dims, out_dim):
        """Build the layer stack.

        Args:
            input_dim: size of the input feature vector.
            hidden_dims: sequence whose entries [0] and [1] give the two
                hidden-layer widths.
            out_dim: number of outputs.
        """
        super(MovieNet, self).__init__()
        self.layers = nn.Sequential(
            [INSERT CODE]
        )

    def forward(self, x):
        """Return the output of the sequential layer stack applied to `x`."""
        return self.layers(x)


## Question 3:
Train the neural network to predict movie genres

In [None]:
## train neural network to predict genres
train_data = [INSERT CODE]

train_loader = DataLoader(train_data, batch_size=20, shuffle=True)

epochs = 200
hidden_dims = [32, 16]

model = MovieNet( [INSERT CODE] )
lr = 0.001
optimizer = optim.Adam(model.parameters(), lr=lr)

for epoch in range(epochs):
    for x_batch, y_batch in train_loader:
        y_pred = model(x_batch)
        loss = F.binary_cross_entropy(y_pred, y_batch)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    if epoch % 50 == 0:
        print('Epoch %d, loss %.4f' % (epoch, loss.item()))


## Question 4:

Recall we have 10 genres.

`outcome` is a 10-dimensional binary list or tuple.

`probs` is a 10-dimensional list containing the predicted probability of each genre.

You will complete the following functions:
* `computeScore` computes the non-conformity score of an outcome, based on the predicted probabilities
* `computeConformalSet` computes the prediction set


In [None]:

## you may find this function helpful
def computeProbability(sequence, probs):
    """Return the product of per-position probabilities for a binary outcome.

    For every position i of `sequence`, the factor is probs[i] when
    sequence[i] == 1 and 1 - probs[i] otherwise; the result is the product
    of all factors (1.0 for an empty sequence).

    Args:
        sequence: indexable collection of 0/1 entries.
        probs: indexable collection with at least len(sequence) entries.

    Returns:
        The product as computed by np.prod.
    """
    factors = [
        probs[pos] if bit == 1 else 1 - probs[pos]
        for pos, bit in enumerate(sequence)
    ]
    return np.prod(factors)


def computeScore(my_outcome, probs):
    """Compute the non-conformity score of `my_outcome` given `probs`.

    TODO(student): implementation to be filled in.

    Args:
        my_outcome: binary list or tuple with one entry per genre.
        probs: predicted probability of each genre.

    Returns:
        Presumably a scalar; the calling cell stores the result in a float
        array, one entry per D2 example.
    """
    [INSERT CODE]



def computeConformalSet(probs, scores, alpha):
    """Build the conformal prediction set for one example.

    Args:
        probs: predicted probability of each genre for this example.
        scores: non-conformity scores computed on the calibration data.
        alpha: the coverage check elsewhere in this notebook compares the
            empirical coverage against 1 - alpha.

    Returns:
        The value assigned to `conformal_prediction_set`. Callers test
        membership of a 0/1 tuple in it and index it as a sequence of outcomes.
    """
    # Number of calibration scores.
    n2 = len(scores)
    # TODO(student): threshold derived from `scores`, `alpha` and `n2`.
    threshold = [INSERT CODE]

    # Every possible 0/1 assignment over the genres (2 ** len(probs) tuples),
    # ordered from most to least probable under `probs`.
    all_outcomes = list(itertools.product([0,1], repeat=len(probs)))
    sorted_outcomes = sorted(all_outcomes, key=lambda x: computeProbability(x, probs), reverse=True)
    
    # NOTE(review): computeProbability is evaluated a second time per outcome
    # here (it was already used as the sort key above).
    sorted_probs = [computeProbability(outcome, probs) for outcome in sorted_outcomes]
    # TODO(student): presumably a NumPy array, since it is compared elementwise
    # with `threshold` below -- confirm.
    cumulative_probs = [INSERT CODE]

    # First position whose cumulative value exceeds the threshold.
    # NOTE(review): raises IndexError if no entry exceeds `threshold`.
    conformal_ix = np.where(cumulative_probs > threshold)[0][0]
    # TODO(student): select the outcomes that make up the prediction set.
    conformal_prediction_set = [INSERT CODE]

    return(conformal_prediction_set)


## used to more easily display the conformal prediction set
def confGenreSet(conf_set, genres_to_predict):
    """Translate 0/1 outcome vectors into lists of genre names.

    Args:
        conf_set: sequence of outcomes, each a sequence of 0/1 flags.
        genres_to_predict: genre names, indexed like the flags.

    Returns:
        One list per outcome containing the names whose flag equals 1.
    """
    named_outcomes = []
    for outcome in conf_set:
        names = [genres_to_predict[pos] for pos, flag in enumerate(outcome) if flag == 1]
        named_outcomes.append(names)
    return named_outcomes


## Question 5: 
Compute the non-conformity scores on D2.

In [None]:

# Model outputs for the D2 inputs, converted to a NumPy array
# (detach() drops the autograd graph so .numpy() is permitted).
y_pred2 = model(x_train2)
y_pred2 = y_pred2.detach().numpy()

# One score per D2 example; the array has ntrain - n1 entries, presumably the
# number of rows in D2 -- confirm against the split above.
scores = np.zeros(ntrain-n1)
for j in range(ntrain-n1):
    # TODO(student): non-conformity score of the j-th D2 example.
    scores[j] = [INSERT CODE]


In [None]:
alpha = 0.2

# Split-conformal calibration uses the finite-sample-corrected quantile level
# ceil((n + 1) * (1 - alpha)) / n rather than the plain 1 - alpha level; the
# correction is what makes this the "adjusted" threshold. The level is capped
# at 1 because np.quantile rejects levels above 1.
n_calib = len(scores)
adjusted_level = min(1.0, np.ceil((n_calib + 1) * (1 - alpha)) / n_calib)

print("Adjusted threshold: ", np.quantile(scores, adjusted_level))

## Question 6:

Check whether our conformal prediction set has the desired coverage guarantee on test data.

In [None]:

ntest = 200
covered = 0
for i in range(ntest):
    
    cur_y_pred = model(x_test[i])
    cur_y_pred = cur_y_pred.detach().numpy()

    conf_set = [INSERT CODE]

    y_test_tuple = tuple(y_test[i].int().tolist())
    
    if (y_test_tuple in conf_set):
        covered = covered + 1
    else:
        conf_genres = confGenreSet(conf_set, genres_to_predict)

        ## display mistake
        print(movies_df.iloc[i+ntest]['title'])
        print(movies_df.iloc[i+ntest]['overview'])
        print('Actual genres: ', [genres_to_predict[j] for j in range(len(genres_to_predict)) if y_test_tuple[j] == 1])
        print('Prediction set: ', conf_genres)
        print('')

print("(1-alpha): %.3f    Percent covered: %.3f" % (1-alpha, covered/ntest))

## Extra:

Try generating the prediction sets for each of the following made-up movie descriptions.



In [None]:

# Pretrained tokenizer and encoder used to embed new overview text.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert = BertModel.from_pretrained('bert-base-uncased')

# Made-up movie descriptions to classify.
my_movie_collection = [
"Orphaned fox befriends a lost magical bird, together they embark on a quest, discovering true family, courage, and enchanted realms",
"In a galaxy ruled by machines, a rebellious pilot and a rogue android uncover a cosmic secret, igniting an interstellar chase, alliances, betrayals, and the fate of sentient life.",
"In post-war Paris, an estranged violinist encounters a mute child prodigy, together navigating trauma and rediscovery, their melodies weaving stories of loss, love, and the power of connection",
"Exploring abandoned asylums, a documentary crew uncovers chilling histories intertwined with the supernatural. As they delve deeper, the line between reality and nightmare becomes hauntingly blurred, ensnaring them forever.",
"An uptight lawyer's world turns upside down when a free-spirited barista accidentally receives his trial notes. Amidst coffee spills and court blunders, they find love in the most unexpected verdict",
]

for my_movie in my_movie_collection:

    # Tokenize to a fixed length of 128: shorter text is padded, longer text
    # is truncated.
    my_movie_tokens = tokenizer(
        my_movie,
        return_tensors='pt',
        max_length=128,
        padding='max_length',
        truncation=True,
    )

    # Element [1] of the encoder output, first (and only) row of the batch,
    # as a NumPy vector.
    with torch.no_grad():
        bert_output = bert(**my_movie_tokens)
        my_movie_embed = bert_output[1].detach().numpy()[0]

    # Genre probabilities from the trained network.
    my_pred = model(torch.tensor(my_movie_embed)).detach().numpy()

    conf_set = computeConformalSet(my_pred, scores, alpha)
    genre_set = confGenreSet(conf_set, genres_to_predict)

    mydf = pd.DataFrame({'genres': genres_to_predict, 'probs': my_pred})

    # Report: description, per-genre probabilities, then each genre
    # combination in the prediction set on its own line.
    print(my_movie)

    print("Output probabilities:")
    print(mydf)

    print("Prediction set:")
    for guess in genre_set:
        print(guess)
    print("")
