In [1]:
import pandas as pd, numpy as np, difflib,spacy
from sklearn.feature_extraction.text import (
    TfidfVectorizer,
    HashingVectorizer,
)
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from collections import Counter
from ast import literal_eval
import chunkdot.cosine_similarity_top_k as cstk

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

pd.set_option('display.max_columns', 200)


In [72]:
# Read the raw movie table from disk, then preview it:
# the first two rows, the column labels and the (rows, columns) shape.
md = pd.read_csv('./movies_final.csv')
md.head(n=2)
md.columns
md.shape


Unnamed: 0,id,url,Name,PosterLink,Genres,Actors,Director,Description,DatePublished,Keywords,RatingCount,BestRating,WorstRating,RatingValue,ReviewAurthor,ReviewDate,ReviewBody,duration
0,0,https://www.imdb.com/title/tt2221420/,Sallie Gardner at a Gallop,https://m.media-amazon.com/images/M/MV5BMjk0Mz...,"Documentary,Short","Gilbert Domm,Sallie Gardner",Eadweard Muybridge,Sallie Gardner at a Gallop is a short starring...,1878-06-15,"19th century,1870s,nature,horse,horse riding",2465,10.0,1.0,7.4,Cineanalyst,2013-11-12,"Sometimes ascribed as ""The Father of the Motio...",1M
1,1,https://www.imdb.com/title/tt5459794/,Buffalo Running,https://m.media-amazon.com/images/M/MV5BMTFkM2...,"Documentary,Short",,Eadweard Muybridge,,1883-11-19,"1880s,19th century,buffalo running,american bi...",766,10.0,1.0,6.4,framptonhollis,2017-08-30,It's pretty powerful to see the earliest works...,1M


Index(['id', 'url', 'Name', 'PosterLink', 'Genres', 'Actors', 'Director',
       'Description', 'DatePublished', 'Keywords', 'RatingCount', 'BestRating',
       'WorstRating', 'RatingValue', 'ReviewAurthor', 'ReviewDate',
       'ReviewBody', 'duration'],
      dtype='object')

(48513, 18)

In [73]:
# Number of missing values in each text column, printed one per line
# in the order the columns are listed here.
print(
    *(
        md[column].isna().sum()
        for column in ('Genres', 'Description', 'Director', 'Actors', 'Keywords', 'Name')
    ),
    sep='\n',
)


25
1287
635
740
2141
0


In [74]:
# Remove the columns that are not used further on and fix the two column
# names (including the 'ReviewAurthor' typo).
# Built as a chain that returns a new frame instead of mutating in place;
# errors='ignore' lets the cell be re-run after the columns are already gone
# (the in-place drop raised KeyError on a second run).
md = (
    md
    .drop(columns=['duration', 'id', 'BestRating', 'WorstRating'], errors='ignore')
    .rename(columns={'PosterLink': 'Poster', 'ReviewAurthor': 'ReviewAuthor'})
)
md.head(2)


Unnamed: 0,url,Name,Poster,Genres,Actors,Director,Description,DatePublished,Keywords,RatingCount,RatingValue,ReviewAuthor,ReviewDate,ReviewBody
0,https://www.imdb.com/title/tt2221420/,Sallie Gardner at a Gallop,https://m.media-amazon.com/images/M/MV5BMjk0Mz...,"Documentary,Short","Gilbert Domm,Sallie Gardner",Eadweard Muybridge,Sallie Gardner at a Gallop is a short starring...,1878-06-15,"19th century,1870s,nature,horse,horse riding",2465,7.4,Cineanalyst,2013-11-12,"Sometimes ascribed as ""The Father of the Motio..."
1,https://www.imdb.com/title/tt5459794/,Buffalo Running,https://m.media-amazon.com/images/M/MV5BMTFkM2...,"Documentary,Short",,Eadweard Muybridge,,1883-11-19,"1880s,19th century,buffalo running,american bi...",766,6.4,framptonhollis,2017-08-30,It's pretty powerful to see the earliest works...


In [75]:
feats = ['Genres', 'Name', 'Keywords', 'Actors', 'Director', 'Description']

# Replace missing values in the text columns with the empty string.
# The result is assigned back on the frame: calling fillna(inplace=True) on
# md[col] operates on an intermediate object, which pandas deprecates and
# which leaves md unchanged once Copy-on-Write is enabled.
md[feats] = md[feats].fillna('')


In [76]:
# Run every description through the small English spaCy pipeline and keep,
# per movie, the token lemmas joined by single spaces.
nlp = spacy.load('en_core_web_sm')
lemma_text_list = [
    " ".join(tok.lemma_ for tok in parsed)
    for parsed in nlp.pipe(md["Description"])
]
md["DescriptionLemma"] = lemma_text_list


In [77]:
# Sanity check of the two rating columns: null counts first, then the
# distinct values in order of first appearance.
# NOTE(review): the recorded output lists -1.0 among the RatingValue values,
# presumably a placeholder for unrated titles; confirm against the data source.
md['RatingValue'].isnull().sum()
md['RatingCount'].isnull().sum()
pd.unique(md['RatingValue'])
pd.unique(md['RatingCount'])


0

0

array([ 7.4,  6.4,  5.2,  6.7,  7.3,  5.5,  5.8,  5. ,  4.8,  4.4,  5.3,
        4.7,  4.9,  5.1,  6.5,  3.9,  6.2,  5.7,  5.4,  5.9,  4.3,  4.1,
        5.6,  4.5,  6.9,  7.1,  6. ,  6.3,  6.1,  7.5,  7. ,  8.2,  6.8,
        6.6,  2.9,  7.6,  7.2,  7.7,  7.8,  8.1,  8.3,  7.9,  8. , -1. ,
        8.4,  8.5,  3.7,  4.6,  3.6,  3.3,  4. ,  3.8,  2.7,  3.4,  2.5,
        8.6,  2. ,  2.2,  3.2,  1.6,  3. ,  4.2,  3.1,  2.1,  2.4,  2.3,
        9. ,  2.8,  3.5,  8.8,  1.7,  1.8,  8.9,  1.9,  2.6,  9.3,  9.2,
        8.7,  1.4,  1.3,  9.1,  9.4,  1.5,  1.1,  9.5])

array([ 2465,   766,  1140, ..., 18223, 65466, 35822], dtype=int64)

In [78]:
# Keep only the movies that have at least one rating.
md = md[md['RatingCount'].astype('int32') > 0].reset_index(drop=True)

# Mean rating over the remaining movies. Computed once here: evaluating it
# inside the per-row function recomputed the same column mean for every row.
global_mean_rating = md["RatingValue"].mean()


def weighted_rating(x, mean_rating=None):
    """Blend a movie's own rating with the overall mean rating.

    Computes ``(v * avg + mean_rating) / (1 + v)`` where ``v`` is
    ``x["RatingCount"]`` and ``avg`` is ``x["RatingValue"]``; the mean enters
    with the weight of a single extra vote.

    Args:
        x: Anything indexable by "RatingCount" and "RatingValue": a single
            row, or the whole DataFrame for a column-wise computation.
        mean_rating: Overall mean rating. When omitted it is taken from
            ``md["RatingValue"]``, as the one-argument form always did.

    Returns:
        The weighted rating: a scalar for a row, a Series for a DataFrame.
    """
    if mean_rating is None:
        mean_rating = md["RatingValue"].mean()
    v, avg = x["RatingCount"], x["RatingValue"]
    return (v * avg + mean_rating) / (1 + v)


# Column-wise evaluation; same arithmetic as applying the function row by row.
md["votewt"] = weighted_rating(md, global_mean_rating)
# Stable sort, best weighted rating first; the index is renumbered 0..n-1.
md = md.sort_values("votewt", ascending=False, kind='mergesort', ignore_index=True)
md.shape


(48477, 16)

In [79]:
# Turn the comma-separated text columns into lists of strings.
# An empty string becomes [''] (a list holding one empty string).
for list_col in ('Genres', 'Actors', 'Keywords'):
    md[list_col] = md[list_col].map(lambda text: text.split(','))

# Persist the preprocessed table so later cells can start from the file.
md.to_feather(
    './movies_final.feather',
    compression="lz4",
    compression_level=9,
)


In [2]:
# Columns that feed the text features, followed by a reload of the
# preprocessed table written by the earlier to_feather cell.
feats = ['Genres', 'Actors', 'Keywords', 'Name', 'Director', 'DescriptionLemma']
md = pd.read_feather('./movies_final.feather')


In [81]:
def _as_text(items, squash=False):
    """Join ``items`` with single spaces and append one trailing space.

    With ``squash=True`` the spaces inside each item are removed first, so a
    multi-word item turns into a single token.
    """
    if squash:
        items = (item.replace(' ', '') for item in items)
    return ' '.join(items) + ' '


# One text document per movie. A field is weighted by repeating its text:
# genres x1, keywords x6, actors x1, name x2, director x2, description lemmas x8.
# Actors, name and director have their inner spaces removed; genres and
# keywords keep theirs. (An earlier variant, previously left commented out,
# also squashed genres and keywords and weighted them x2 and x4.)
# The pieces are appended with in-place +=, so the Series keeps the name 'Genres'.
combined_feats = md['Genres'].apply(_as_text)
combined_feats += md['Keywords'].apply(_as_text) * 6
combined_feats += md['Actors'].apply(_as_text, squash=True)
for text_col in ('Name', 'Director'):
    combined_feats += md[text_col].apply(lambda text: text.replace(' ', '') + ' ') * 2
combined_feats += (md['DescriptionLemma'] + ' ') * 8
combined_feats


0        Documentary nature documentary mountain desert...
1        Comedy based on board game based on board game...
2        Action Drama History War airborne troops ameri...
3        Documentary animal life earth tv mini series n...
4        Adventure Drama Family Mystery Sci-Fi gallifre...
                               ...                        
48472    Comedy Horror Mystery Thriller slasher slasher...
48473    Music concert surrealism rock music band perfo...
48474    Comedy Horror school school school school scho...
48475    Comedy remake remake remake remake remake rema...
48476    Action Drama running running running running r...
Name: Genres, Length: 48477, dtype: object

In [82]:
# Word-level TF-IDF matrix of the combined per-movie text: English stop words
# removed, accents stripped via unicode normalisation, float32 values.
combined_vector = TfidfVectorizer(
    dtype=np.float32,
    strip_accents="unicode",
    stop_words="english",
    analyzer="word",
).fit_transform(combined_feats)


In [83]:
# Sparse movie-by-movie similarity matrix with top_k=21 entries requested per row
# (the recorded output has 1018017 stored elements = 21 x 48477 rows).
# NOTE(review): presumably 21 = the movie itself plus 20 neighbours; confirm
# against the chunkdot documentation.
combined_sim=cstk(combined_vector, top_k=21)
combined_sim


<48477x48477 sparse matrix of type '<class 'numpy.float32'>'
	with 1018017 stored elements in Compressed Sparse Row format>

In [84]:
# Lookup table: movie title -> [index label, poster link].
# When a title occurs more than once, the last occurrence wins.
indices = dict(zip(md["Name"], map(list, zip(md.index, md['Poster']))))

def get_recommendations(title, combined_sim=combined_sim):
    """Return the names of the movies most similar to ``title``, best first.

    Args:
        title: A key of ``indices``; an unknown title raises ``KeyError``.
        combined_sim: Sparse similarity matrix; row ``i`` holds the stored
            similarity scores of movie ``i``. Defaults to the matrix that
            exists when this function is defined.

    Returns:
        A Series of movie names ordered by descending similarity, without
        the queried movie itself.
    """
    idx = indices[title][0]
    row = combined_sim[idx]
    # Stable sort by score only, so equal scores keep their stored order.
    ranked = sorted(zip(row.data, row.indices), key=lambda pair: pair[0], reverse=True)
    # Remove the queried movie by its row number rather than by dropping the
    # first entry: with tied scores the movie itself is not guaranteed to
    # sort first, and it would otherwise be recommended to itself.
    neighbours = [col for _, col in ranked if col != idx]
    # The matrix columns are row positions, hence .iloc.
    # NOTE(review): assumes md has its default 0..n-1 index, so the label
    # stored in `indices` is also the row position; confirm if md is re-indexed.
    return md["Name"].iloc[neighbours]


In [85]:
def closest_match(title, candidates=None, cutoff=0.6):
    """Return the known title that is most similar to ``title``.

    Args:
        title: Free-text query.
        candidates: Iterable of titles to search; defaults to the keys of
            ``indices``.
        cutoff: Minimum similarity in [0, 1] passed to
            ``difflib.get_close_matches`` (0.6 is difflib's own default).

    Returns:
        The single best-matching candidate.

    Raises:
        IndexError: If no candidate reaches ``cutoff``. The exception type is
            the one the bare ``[0]`` lookup used to raise; it now carries a
            message naming the query.
    """
    pool = indices.keys() if candidates is None else candidates
    matches = difflib.get_close_matches(title, pool, n=1, cutoff=cutoff)
    if not matches:
        raise IndexError(f"no title is close enough to {title!r} (cutoff={cutoff})")
    return matches[0]

# Example: resolve the lowercase query 'batman' to the closest known title,
# then show the recommendations for that title.
get_recommendations(closest_match('batman'))


1620               Batman: The Dark Knight Returns, Part 1
775                                          Batman Begins
31170    Lego DC Comics Superheroes: Justice League - G...
29802                                  Batman vs. Two-Face
1870                                         Batman Beyond
22716                                       Miyori no mori
6703                                      Batman: Year One
42837                                         Jungle Woman
393                Batman: The Dark Knight Returns, Part 2
366                                  The Dark Knight Rises
5358                                                Batman
13269                                       Batman Returns
37216                                Behind the Rising Sun
2028                                                Elijah
17152                                Batman: Gotham Knight
2802                    Batman Beyond: Return of the Joker
22010                  Lego DC Comics: Batman Be-Leaguer

In [86]:
# First two rows of the sorted table.
md.iloc[:2]


Unnamed: 0,url,Name,Poster,Genres,Actors,Director,Description,DatePublished,Keywords,RatingCount,RatingValue,ReviewAuthor,ReviewDate,ReviewBody,DescriptionLemma,votewt
0,https://www.imdb.com/title/tt5491994/,Planet Earth II,https://m.media-amazon.com/images/M/MV5BZWYxOD...,[Documentary],[David Attenborough],,Planet Earth II is a TV mini-series starring D...,2016-11-06,"[nature documentary, mountain, desert, island,...",95049,9.5,NeilBarnett,2016-11-14,This is without doubt the best thing I have se...,Planet Earth II be a tv mini - series star Dav...,9.499966
1,https://www.imdb.com/title/tt6290024/,Uno: The Movie,https://m.media-amazon.com/images/M/MV5BN2YyNm...,[Comedy],"[Geoff Ramsey, Gavin Free, Ryan Haywood, Jerem...",,Uno: The Movie is a video starring Geoff Ramse...,2016-11-30,[based on board game],10099,9.5,sorrowscythe,2016-12-03,When I heard about this I thought it will be t...,"uno : the Movie be a video star Geoff Ramsey ,...",9.499685


In [87]:
# How many movies carry each genre, ignoring empty genre strings and
# keeping only the genres that occur at least 100 times.
genre_counts = {
    name: total
    for name, total in Counter(
        label
        for labels in md['Genres']
        for label in labels
        if label != ''
    ).items()
    if total >= 100
}
genre_counts


{'Documentary': 4637,
 'Comedy': 16033,
 'Action': 6993,
 'Drama': 24837,
 'History': 2175,
 'War': 2160,
 'Adventure': 4974,
 'Family': 3753,
 'Mystery': 3850,
 'Sci-Fi': 3781,
 'Music': 1804,
 'Crime': 6734,
 'Thriller': 9421,
 'Sport': 1133,
 'Musical': 1463,
 'Animation': 2501,
 'Fantasy': 3765,
 'Biography': 2476,
 'Horror': 5384,
 'Romance': 9091,
 'Western': 1178,
 'News': 167,
 'Short': 1942,
 'Film-Noir': 495}

In [88]:

def get_genre_movies(genre, movies=None):
    """Return the 'Name' and 'votewt' columns of the movies in ``genre``.

    Args:
        genre: A genre label, or 'All' to return every movie.
        movies: Frame with 'Name', 'votewt' and a list-like 'Genres' column;
            defaults to the notebook-level ``md``.

    Returns:
        A DataFrame with the columns ['Name', 'votewt'], in the row order of
        the input frame.
    """
    frame = md if movies is None else movies
    columns = ['Name', 'votewt']
    if genre == 'All':
        return frame[columns]
    # The column is 'Genres' (the lowercase 'genres' lookup raised KeyError).
    # Exact membership test per movie: a substring/regex match on the text
    # form would also return 'Musical' titles when asked for 'Music'.
    has_genre = frame['Genres'].apply(lambda labels: genre in list(labels)).astype(bool)
    return frame[has_genre][columns]


In [38]:
# Materialise the (index label, row) pairs of the Name/Poster columns and
# print the poster link of the first row.
x = [*md.loc[:, ['Name', 'Poster']].iterrows()]
print(x[0][1].loc['Poster'])


https://m.media-amazon.com/images/M/MV5BZWYxODViMGYtMGE2ZC00ZGQ3LThhMWUtYTVkNGE3OWU4NWRkL2ltYWdlL2ltYWdlXkEyXkFqcGdeQXVyMjYwNDA2MDE@._V1_.jpg
