import numpy as np
import pandas as pd
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn import functional as F
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import sklearn
import mlflow
import mlflow.pyfunc
import warnings
warnings.filterwarnings('ignore')

# Build the nearest-neighbours (k-NN) recommendation pipeline

In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline
import pickle

In [2]:
# Locate the 'data' and 'model' folders that sit next to the working directory

_project_root = os.path.dirname(os.getcwd())
data_dir = os.path.join(_project_root, 'data')
model_dir = os.path.join(_project_root, 'model')

In [3]:
# Input CSV that is read into `df` below.
# NOTE(review): 'imbd' (sic) presumably matches the file name on disk - confirm before renaming.
imdb_file_path = os.path.join(data_dir, 'imbd_amazon_movie_vectors.csv')

In [4]:
# Output CSV for the display columns exported from `df`; the same path is
# later registered with MLflow under the "final_df" artifact name.
final_filter_df_path = os.path.join(data_dir, 'final_df.csv')

In [5]:
# Load the movie table from imdb_file_path
df = pd.read_csv(imdb_file_path)

# Print the column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4304 entries, 0 to 4303
Data columns (total 30 columns):
Unnamed: 0                    4304 non-null int64
title_origin_imbd             4304 non-null object
rating                        3897 non-null object
year                          4304 non-null float64
users_rating                  4304 non-null float64
votes                         4304 non-null object
metascore                     2122 non-null float64
img_url                       4243 non-null object
countries                     4304 non-null object
languages                     4304 non-null object
actors                        4304 non-null object
genre                         4304 non-null object
tagline                       3528 non-null object
description                   4242 non-null object
directors                     4295 non-null object
runtime                       3905 non-null object
imdb_url                      4304 non-null object
votes_int               

In [6]:
# Export the columns needed to present a recommendation; this has to run
# before `df` is narrowed to the feature columns further down.
display_columns = [
    'title_clean',
    'users_rating',
    'img_url',
    'year',
    'genre',
    'description',
    'title_origin_imbd',
    'title_origin_amazon',
]
df[display_columns].to_csv(final_filter_df_path)

In [7]:
# Title and rating, followed by the five score columns used as k-NN features
col_keep = [
    'title_clean',
    'users_rating',
    'deep_profound',
    'entertaining_music',
    'realistic_settings',
    'experience_excitement',
    'fun',
]

In [8]:
# Keep only the columns listed in col_keep, in that order
df = df.loc[:, col_keep]

# Sanity check: row/column count and the first few rows
print(df.shape)
df.head()

(4304, 7)


Unnamed: 0,title_clean,users_rating,deep_profound,entertaining_music,realistic_settings,experience_excitement,fun
0,For Love of the Game,6.6,0.0,0.0,0.0,0.0,0.0
1,The Lord of the Rings The Return of the King,8.9,0.0,0.0,0.0,1.0,0.0
2,Her,8.0,0.0,0.0,0.0,1.0,0.0
3,Harry Potter and the Order of the Phoenix,7.5,0.0,1.0,0.0,1.0,0.0
4,Heat,8.2,0.0,0.0,0.0,0.0,0.0


In [9]:
# Feature matrix: everything except the title and the rating
# (errors='ignore' tolerates either column being absent already)
non_feature_columns = ['title_clean', 'users_rating']
X = df.drop(columns=non_feature_columns, errors='ignore')

In [10]:
# Two named stages: standardise the features, then index them for a
# 10-nearest-neighbour lookup backed by a ball tree.
scaler_step = ('scaler', StandardScaler())
knn_step = ('knn', NearestNeighbors(n_neighbors=10, algorithm='ball_tree'))
steps = [scaler_step, knn_step]

In [11]:
# Chain the named steps into a single estimator
pipeline = Pipeline(steps)

In [12]:
# Fit the scaler and the neighbour index on the feature matrix
pipeline.fit(X)

Pipeline(steps=[('scaler', StandardScaler()),
                ('knn',
                 NearestNeighbors(algorithm='ball_tree', n_neighbors=10))])

In [13]:
# Look up the fitted 'scaler' step by name (displayed as the cell result)
pipeline['scaler']

StandardScaler()

In [14]:
# Example query as a single row: five values, one per feature column left in X.
# The row is scaled with the already-fitted scaler before the neighbour search.
query = np.array([[1, 3, 3, 4, 5]])
query_scaled = pipeline['scaler'].transform(query)

In [15]:
# dist: distances to the nearest neighbours of the scaled query;
# ind: their row positions in the data the index was fitted on
dist, ind = pipeline['knn'].kneighbors(query_scaled)

In [16]:
# Show the neighbour row positions
ind

array([[2946,  658, 3503,  129,  808, 3629, 3451, 1850, 4258,  259]],
      dtype=int64)

In [17]:
# ind has one row per query; take the first (only) row and select those
# positions from df
neighbour_positions = ind[0]
filtered_recommendations = df.iloc[neighbour_positions]

In [18]:
# Show the recommended rows
filtered_recommendations

Unnamed: 0,title_clean,users_rating,deep_profound,entertaining_music,realistic_settings,experience_excitement,fun
2946,Take Her She s Mine,6.4,0.0,2.0,2.0,2.0,2.0
658,No Such Thing,6.1,0.0,3.0,2.0,1.0,2.0
3503,Elektra,4.7,0.0,1.0,1.0,3.0,2.0
129,Fun with Dick and Jane,6.2,1.0,1.0,1.0,1.0,4.0
808,The Hebrew Hammer,6.1,1.0,1.0,1.0,1.0,4.0
3629,Tower Heist,6.2,1.0,2.0,0.0,2.0,2.0
3451,Be Cool,5.6,0.0,1.0,1.0,2.0,2.0
1850,The Lost Skeleton of Cadavra,7.0,0.0,1.0,0.0,2.0,4.0
4258,School of Rock,7.1,0.0,2.0,1.0,1.0,2.0
259,Forces of Nature,5.4,0.0,2.0,1.0,1.0,2.0


In [19]:
# Persist the fitted pipeline so it can be packaged and reloaded later
filename = 'finalized_model.sav'

# Create the target folder on a fresh checkout; without it open() raises
# FileNotFoundError.
os.makedirs(model_dir, exist_ok=True)

# The context manager flushes and closes the handle even if pickling fails
# (the bare open() used previously left the file open).
with open(os.path.join(model_dir, filename), 'wb') as model_file:
    pickle.dump(pipeline, model_file)

# Package the pipeline as an MLflow pyfunc model

import mlflow
import mlflow.pyfunc
import mlflow.sklearn
import warnings
warnings.filterwarnings('ignore')

# Directory the packaged pyfunc model is written to
mlflow_sklearn_path = os.path.join(model_dir, "filter_deploy")

# Default conda environment MLflow generates for scikit-learn models
conda_env = mlflow.sklearn.get_default_conda_env()

# Files copied into the packaged model and exposed to the wrapper through
# context.artifacts[<name>]:
#   "final_df" - CSV with the display columns for each movie
#   "model"    - the pickled scaler + nearest-neighbours pipeline; without this
#                entry the package would ship without the fitted estimator
artifacts = {
    "final_df": final_filter_df_path,
    "model": os.path.join(model_dir, filename),
}

class ModelWrapper(mlflow.pyfunc.PythonModel):
    """MLflow pyfunc wrapper around the scaler + nearest-neighbours pipeline.

    ``load_context`` restores the pickled pipeline and the movie table;
    ``predict`` scales one query vector and returns the rows of the movie
    table at the positions of its nearest neighbours.
    """

    def load_context(self, context):
        """Load the pipeline and the movie table from the packaged artifacts.

        Args:
            context: Object provided by MLflow; ``context.artifacts`` maps
                each artifact name given to ``save_model`` to a local path.

        Raises:
            KeyError: If the "final_df" artifact was not packaged.
            FileNotFoundError: If the resolved model file does not exist.
        """
        # Imported here so the wrapper does not depend on module-level
        # imports of the process that loads it.
        import pickle
        import pandas as pd

        # Prefer the packaged "model" artifact; fall back to the legacy
        # relative path when the model was saved without that artifact.
        model_path = context.artifacts.get("model", "model.pkl")

        # NOTE: unpickling executes arbitrary code - only load model files
        # that come from a trusted source.
        with open(model_path, "rb") as model_file:
            self.model = pickle.load(model_file)

        # Movie table the neighbour positions are looked up in. Its rows must
        # be in the same order as the data the pipeline was fitted on.
        self.df = pd.read_csv(context.artifacts["final_df"])

    def predict(self, context, model_input):
        """Return the movies closest to a single query vector.

        Args:
            context: MLflow context (unused here).
            model_input: One query; it is flattened into a single row, so it
                must hold exactly one value per feature the scaler was
                fitted on.

        Returns:
            pandas.DataFrame: Rows of the movie table at the positions of the
            nearest neighbours, in the order returned by the neighbour search.
        """
        import numpy as np

        # Flatten whatever was passed into one (1, n_features) row.
        query = np.array(model_input).reshape(1, -1)
        query_scaled = self.model['scaler'].transform(query)

        # kneighbors returns (distances, positions); only positions are used.
        _, ind = self.model['knn'].kneighbors(query_scaled)

        # ind has one row per query; positional lookup into the loaded table.
        return self.df.iloc[ind[0], :]

# Package the model!
# Saves a pyfunc model under mlflow_sklearn_path from a ModelWrapper instance,
# the files listed in `artifacts`, and the conda environment.
# NOTE(review): MLflow presumably refuses to write into an existing path -
# confirm, and clear the directory before re-running this cell if so.
mlflow.pyfunc.save_model(path=mlflow_sklearn_path,
                         python_model=ModelWrapper(),
                         artifacts=artifacts,
                         conda_env=conda_env)