In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [2]:
# Read the tag table from tags.csv in the current working directory.
data = pd.read_csv("tags.csv")

In [3]:
# Preview the first rows to see which columns were loaded.
data.head()

Unnamed: 0.1,Unnamed: 0,movie_id,title,clean_tags
0,0,19995,Avatar,nd century paraplegic marine dispatched moon p...
1,1,285,Pirates of the Caribbean: At World's End,captain barbossa long believed dead come back ...
2,2,206647,Spectre,cryptic message bond past sends trail uncover ...
3,3,49026,The Dark Knight Rises,following death district attorney harvey dent ...
4,4,49529,John Carter,john carter warweary former military captain w...


In [4]:
# Remove the 'Unnamed: 0' column (presumably an index saved by an earlier to_csv).
# errors='ignore' keeps this cell re-runnable: `data` is overwritten here, so a
# second run would otherwise raise KeyError because the column is already gone.
data = data.drop(columns='Unnamed: 0', errors='ignore')

In [5]:
# TF-IDF weighting over the tag text; accented characters are folded to ASCII.
vectorizer = TfidfVectorizer(
    use_idf=True,
    strip_accents='ascii',
)

In [6]:
# Keep the TF-IDF matrix sparse: fit_transform already returns a SciPy sparse
# matrix, and a dense float64 copy of every (document, term) cell is not needed
# just to inspect its shape.
vectors = vectorizer.fit_transform(data['clean_tags'])

In [7]:
# (number of documents, vocabulary size) of the TF-IDF matrix.
vectors.shape

(4806, 39084)

In [8]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. tolist() keeps the displayed
# value a plain list of str, as before.
vectorizer.get_feature_names_out().tolist()

['aa',
 'aaa',
 'aalch',
 'aaliyah',
 'aames',
 'aamirkhan',
 'aang',
 'aaranthomas',
 'aaron',
 'aaronabrams',
 'aaroneckhart',
 'aaronhann',
 'aaronkwok',
 'aaronmurphy',
 'aaronpaul',
 'aaronruell',
 'aarons',
 'aaronschneider',
 'aaronseltzer',
 'aaronstanford',
 'aaront',
 'aarontaylorjohnson',
 'aaronyoo',
 'aasheekaabathija',
 'aasifmandvi',
 'aba',
 'abaddon',
 'abagnale',
 'abandon',
 'abandoned',
 'abandonedhouse',
 'abandonedmine',
 'abandoning',
 'abandonment',
 'abandons',
 'abba',
 'abbas',
 'abbate',
 'abberline',
 'abbey',
 'abbeylincoln',
 'abbie',
 'abbiecornish',
 'abbott',
 'abby',
 'abdicates',
 'abdication',
 'abduct',
 'abducted',
 'abduction',
 'abductions',
 'abductors',
 'abducts',
 'abe',
 'abel',
 'abelferrara',
 'aberdeen',
 'abernal',
 'abernathy',
 'abhinayvaddi',
 'abhishekbachchan',
 'abhorrent',
 'abhors',
 'abide',
 'abigail',
 'abigailbianca',
 'abigailbreslin',
 'abigailhargrove',
 'abigailspencer',
 'abilities',
 'ability',
 'ablai',
 'able',
 'abl

In [9]:
from nltk.stem import WordNetLemmatizer
# Lemmatizer instance shared by lemat(); it needs the NLTK 'wordnet' corpus
# to be installed (nltk.download('wordnet')).
lemmatizer = WordNetLemmatizer()

In [10]:
def lemat(text):
    """Lemmatize each whitespace-separated token of ``text``.

    Every token is passed on its own to the module-level ``lemmatizer`` and
    the results are joined back together with single spaces.
    """
    return " ".join(lemmatizer.lemmatize(token) for token in text.split())

In [11]:
# One-time setup: uncomment and run if the WordNet corpus is not installed yet.
#nltk.download('wordnet')

In [12]:
# Replace every tag string with its lemmatized form.
data['clean_tags'] = data['clean_tags'].map(lemat)

In [13]:
# Preview the frame again after clean_tags has been lemmatized.
data.head()

Unnamed: 0,movie_id,title,clean_tags
0,19995,Avatar,nd century paraplegic marine dispatched moon p...
1,285,Pirates of the Caribbean: At World's End,captain barbossa long believed dead come back ...
2,206647,Spectre,cryptic message bond past sends trail uncover ...
3,49026,The Dark Knight Rises,following death district attorney harvey dent ...
4,49529,John Carter,john carter warweary former military captain w...


In [14]:
# Refit on the lemmatized tags, this time dropping English stop words.
vectorizer = TfidfVectorizer(use_idf=True, strip_accents='ascii', stop_words='english')
# Leave the result sparse: cosine_similarity accepts sparse input, so a dense
# copy of the whole document-term matrix (.toarray()) is unnecessary.
vectors = vectorizer.fit_transform(data['clean_tags'])

In [15]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. tolist() keeps the displayed
# value a plain list of str, as before.
vectorizer.get_feature_names_out().tolist()

['aa',
 'aaa',
 'aalch',
 'aaliyah',
 'aames',
 'aamirkhan',
 'aang',
 'aaranthomas',
 'aaron',
 'aaronabrams',
 'aaroneckhart',
 'aaronhann',
 'aaronkwok',
 'aaronmurphy',
 'aaronpaul',
 'aaronruell',
 'aaronschneider',
 'aaronseltzer',
 'aaronstanford',
 'aaront',
 'aarontaylorjohnson',
 'aaronyoo',
 'aasheekaabathija',
 'aasifmandvi',
 'aba',
 'abaddon',
 'abagnale',
 'abandon',
 'abandoned',
 'abandonedhouse',
 'abandonedmine',
 'abandoning',
 'abandonment',
 'abba',
 'abbas',
 'abbate',
 'abberline',
 'abbey',
 'abbeylincoln',
 'abbie',
 'abbiecornish',
 'abbott',
 'abby',
 'abdicates',
 'abdication',
 'abduct',
 'abducted',
 'abduction',
 'abductor',
 'abducts',
 'abe',
 'abel',
 'abelferrara',
 'aberdeen',
 'abernal',
 'abernathy',
 'abhinayvaddi',
 'abhishekbachchan',
 'abhorrent',
 'abhors',
 'abide',
 'abigail',
 'abigailbianca',
 'abigailbreslin',
 'abigailhargrove',
 'abigailspencer',
 'ability',
 'ablai',
 'able',
 'ableman',
 'aboard',
 'abode',
 'abolish',
 'abolition',


In [16]:
from sklearn.metrics.pairwise import cosine_similarity

In [21]:
# Pairwise cosine similarity between every pair of rows in `vectors`.
similarity=cosine_similarity(vectors)

In [36]:
# Quantize similarities from [0, 1] to uint16 (presumably to shrink the matrix
# before pickling). This cell overwrites its own input, so it is guarded:
# re-running it on the already-quantized array would multiply uint16 values by
# 65535 and silently wrap around.
if similarity.dtype != np.uint16:
    # clip guards against float round-off pushing a value just outside [0, 1]
    similarity = (
        (np.iinfo(np.uint16).max * np.clip(similarity, 0.0, 1.0))
        .round()
        .astype(np.uint16)
    )

In [37]:
# Inspect the first row of the quantized matrix.
similarity[0]

array([65535,   938,  1271, ...,   873,   227,     0], dtype=uint16)

In [38]:
def recommend(movie, top_n=5):
    """Print the titles of the movies most similar to ``movie``.

    Parameters
    ----------
    movie : str
        Exact title to look up in ``data['title']``. If the title occurs more
        than once, the first occurrence is used.
    top_n : int, default 5
        Number of recommendations to print.

    Raises
    ------
    IndexError
        If ``movie`` is not present in ``data['title']``.
    """
    # Work with row *positions*, not index labels: `similarity` is indexed by
    # position, so labels only line up with it while `data` has a default index.
    matches = np.flatnonzero((data['title'] == movie).to_numpy())
    if matches.size == 0:
        raise IndexError(f"movie {movie!r} not found in data['title']")
    movie_index = int(matches[0])

    distances = np.asarray(similarity[movie_index], dtype=np.float64)
    # Stable descending sort: ties keep their original row order.
    ranked = np.argsort(-distances, kind='stable')
    # Drop the query movie explicitly rather than assuming it sorts first;
    # with quantized scores another row can tie with it at the top value.
    ranked = ranked[ranked != movie_index][:top_n]

    titles = data['title']
    for position in ranked:
        print(titles.iloc[position])
        

In [39]:
# Spot-check the recommender on a franchise title.
recommend('The Dark Knight Rises')

The Dark Knight
Batman Returns
Batman Begins
Batman Forever
Batman


In [40]:
# Spot-check the recommender on a second title.
recommend('John Carter')

Mission to Mars
Get Carter
Ghosts of Mars
Red Planet
The Marine 4: Moving Target


In [41]:
import pickle
# Persist the tag table. The with-block closes the file handle as soon as the
# dump finishes instead of leaving an open handle to be garbage-collected.
with open('movies.pkl', 'wb') as movies_file:
    pickle.dump(data, movies_file)

In [42]:
# Persist the similarity matrix. It is held in `similarity`; the name
# `similarity1` is never assigned in this notebook and raised NameError on a
# fresh run.
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)

In [25]:
 pickle.dump(data.to_dict(), open('movies_dict.pkl', 'wb'))