In [1]:
import pandas as pd

df = pd.read_csv('imdb_clean_1990.csv')
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

In [2]:
df.shape

(54054, 11)

In [3]:
df.head(10)

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,genres,directors,writers,actors,isAdult,averageRating,numVotes
0,tt0035423,Kate & Leopold,2001,118,"Comedy,Fantasy,Romance",nm0003506,"nm0737216,nm0003506","nm0413168,nm0000630,nm0005227",0,6.4,76677
1,tt0066498,The Ear,1990,94,"Drama,Thriller",nm0434189,"nm0434189,nm0698311,nm0935156","nm0117505,nm0649121,nm0392705",0,7.8,2306
2,tt0069049,The Other Side of the Wind,2018,122,Drama,nm0000080,"nm0000080,nm0462648","nm0001379,nm0000953",0,6.8,5469
3,tt0072670,Attila 74: The Rape of Cyprus,1995,103,Documentary,nm0128050,nm0128050,nm0128050,0,7.0,237
4,tt0081145,Me and the Kid,1993,94,"Comedy,Crime,Drama",nm0193303,"nm0169785,nm0849601","nm0000732,nm0958406,nm0001592",0,5.4,214
5,tt0081721,Vincent and Me,1990,100,"Drama,Family,Fantasy",nm0747808,nm0747808,"nm0001409,nm0286612,nm0459325",0,6.3,283
6,tt0084870,Memories and Confessions,1993,73,"Biography,History",nm0210701,"nm0078760,nm0210701",nm0246822,0,7.3,193
7,tt0088751,The Naked Monster,2005,100,"Comedy,Horror,Sci-Fi","nm0628399,nm0078540",nm0628399,"nm0864851,nm0933983,nm0329491",0,5.5,250
8,tt0090665,Halfaouine: Boy of the Terraces,1990,98,"Comedy,Drama",nm0099276,"nm0099276,nm0100525,nm0420017,nm0502507","nm0099277,nm0012399,nm0238091",0,6.7,1155
9,tt0091899,The Scarlet Scorpion,1990,90,Comedy,nm0136642,nm0524521,"nm0135629,nm0537345",0,6.2,123


#### create keywords

In [5]:
df['keywords'] = df['genres'] + ',' + df['directors'] + ',' + df['writers'] + ',' + df['actors']

#### Vectorize the features

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer()
count_matrix = count.fit_transform(df['keywords'])

In [7]:
count_matrix.shape

(54054, 116682)

#### Cosine Similarity

In [8]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = cosine_similarity(count_matrix, count_matrix)

### Content Based Prediction

In [9]:
df = df.reset_index()
titles = df['primaryTitle']
indices = pd.Series(df.index, index=df['primaryTitle'])

In [10]:
df

Unnamed: 0,index,tconst,primaryTitle,startYear,runtimeMinutes,genres,directors,writers,actors,isAdult,averageRating,numVotes,keywords
0,0,tt0035423,Kate & Leopold,2001,118,"Comedy,Fantasy,Romance",nm0003506,"nm0737216,nm0003506","nm0413168,nm0000630,nm0005227",0,6.4,76677,"Comedy,Fantasy,Romance,nm0003506,nm0737216,nm0..."
1,1,tt0066498,The Ear,1990,94,"Drama,Thriller",nm0434189,"nm0434189,nm0698311,nm0935156","nm0117505,nm0649121,nm0392705",0,7.8,2306,"Drama,Thriller,nm0434189,nm0434189,nm0698311,n..."
2,2,tt0069049,The Other Side of the Wind,2018,122,Drama,nm0000080,"nm0000080,nm0462648","nm0001379,nm0000953",0,6.8,5469,"Drama,nm0000080,nm0000080,nm0462648,nm0001379,..."
3,3,tt0072670,Attila 74: The Rape of Cyprus,1995,103,Documentary,nm0128050,nm0128050,nm0128050,0,7.0,237,"Documentary,nm0128050,nm0128050,nm0128050"
4,4,tt0081145,Me and the Kid,1993,94,"Comedy,Crime,Drama",nm0193303,"nm0169785,nm0849601","nm0000732,nm0958406,nm0001592",0,5.4,214,"Comedy,Crime,Drama,nm0193303,nm0169785,nm08496..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
54049,54049,tt9905462,Pengalila,2019,111,Drama,nm0151535,nm0151535,"nm0482309,nm1230844",0,8.8,550,"Drama,nm0151535,nm0151535,nm0482309,nm1230844"
54050,54050,tt9908390,Le lion,2020,95,Comedy,nm1415268,"nm1597648,nm1597688","nm0200702,nm1081573,nm0431221",0,4.6,141,"Comedy,nm1415268,nm1597648,nm1597688,nm0200702..."
54051,54051,tt9911196,De Beentjes van Sint-Hildegard,2020,103,"Comedy,Drama",nm0631590,nm0277932,"nm0277932,nm10877188",0,7.8,514,"Comedy,Drama,nm0631590,nm0277932,nm0277932,nm1..."
54052,54052,tt9911774,Padmavyuhathile Abhimanyu,2019,130,Drama,nm10536451,"nm10536451,nm10536453,nm10536454","nm2649680,nm1428724",0,8.0,263,"Drama,nm10536451,nm10536451,nm10536453,nm10536..."


In [4]:
def weighted_rating(x,m,C):
    v = x['numVotes']
    R = x['averageRating']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [11]:
def recommendation(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, reverse=True)
    sim_scores = sim_scores[1:51]
    movie_indices = [i[0] for i in sim_scores]
    
    movies = df.iloc[movie_indices][['primaryTitle', 'numVotes', 'averageRating']]
    vote_counts = movies[movies['numVotes'].notnull()]['numVotes'].astype('int')
    vote_averages = movies[movies['averageRating'].notnull()]['averageRating'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(0.60)
    qualified = movies[(movies['numVotes'] >= m) & (movies['numVotes'].notnull()) & (movies['averageRating'].notnull())]
    qualified['numVotes'] = qualified['numVotes'].astype('int')
    qualified['averageRating'] = qualified['averageRating'].astype('int')
    qualified['wr'] = weighted_rating(qualified,m,C)
    qualified = qualified.sort_values('wr', ascending=False).head(10)
    return qualified

In [12]:
predict = recommendation('A Million Little Pieces')
print("-------------------------------------------")
print("Movie Name: A Million Little Pieces")
print(predict)
print("predicting the scores.....")
predict_score = sum(predict['wr'])/len(predict['wr'])
print("predicted score: ",predict_score)

-------------------------------------------
Movie Name: A Million Little Pieces
                            primaryTitle  numVotes  averageRating        wr
54046                             Kaithi      6545              8  7.882121
54036                   Munthiri Monchan       955              8  7.376096
54049                          Pengalila       550              8  7.094463
54003                           Just 6.5      4639              7  6.906693
54020                 End of the Century       905              7  6.628456
54051     De Beentjes van Sint-Hildegard       514              7  6.459768
54048                              Ottam       485              7  6.440942
54040  Upin & Ipin: The Lone Gibbon Kris       414              7  6.388797
54033                             Malang      2191              6  5.953491
54042                                VFW      2164              6  5.952991
predicting the scores.....
predicted score:  6.708381699283194


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
