In [1]:
import pandas as pd
import os

import spacy 
import subprocess
from string import punctuation

from rake_nltk import Rake

In [2]:
def rake_implement(x,r):
    r.extract_keywords_from_text(x)
    return r.get_ranked_phrases()

In [3]:
def get_hotwords(nlp, text):
    result = []
    pos_tag = ['PROPN', 'ADJ', 'NOUN'] # 1
    doc = nlp(text.lower()) # 2
    for token in doc:
        # 3
        if(token.text in nlp.Defaults.stop_words or token.text in punctuation):
            continue
        # 4
        if(token.pos_ in pos_tag):
            result.append(token.text)

    return result # 5

In [4]:
dir_path = os.path.dirname(os.path.realpath("__file__"))
df = pd.read_csv(dir_path + "/IMDb movies.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
df.description = df.description.astype(str)

In [6]:
df_nltk = df.copy()
r = Rake()
df_nltk["key_words"] = df_nltk["description"].apply(lambda x: rake_implement(x,r))

In [7]:
df_nltk.key_words.head()

0                 [female reporter, adventures, 1890s]
1    [notorious australian outlaw ned kelly, true s...
2    [accepts count von waldberg, famous equestrian...
3    [roman general marc antony, ultimately disastr...
4    [original silent film, tangerine dream, new sc...
Name: key_words, dtype: object

In [8]:
temp = df_nltk.director.value_counts() 

In [9]:
temp[temp > 5].shape

(3047,)

### Distribution based on year

In [10]:
exploded = pd.concat([pd.Series(row['title'],  row['genre'].split(', ')) for _, row in df_nltk.iterrows()]).reset_index()

In [35]:
exploded.columns = ["genre", "title"]
df_nltk = df_nltk.drop(["genre"], axis=1)
df_genre = exploded.merge(df_nltk, left_on="title", right_on="title")

In [36]:
df_genre['year'] = df_genre['year'].str.extract('(\d+)', expand=False)
df_genre = df_genre.dropna(subset=["year"])
df_genre.year = df_genre.year.astype(int)

In [52]:
new = df_genre[df_genre.year >= 2010]
med = df_genre[(df_genre.year < 2010) & (df_genre.year >= 1980)]
old = df_genre[(df_genre.year < 1980) ]

In [53]:
df_genre["rounded_vote"] = df_genre["avg_vote"].round()

In [54]:
new.shape

(44717, 24)

In [55]:
med.shape

(365, 24)

In [56]:
old.shape

(79, 24)

In [65]:
new_dict = new.genre.value_counts(normalize=True).to_dict()
med_dict = med.genre.value_counts(normalize=True).to_dict()
old_dict = old.genre.value_counts(normalize=True).to_dict()

In [66]:
genre_dict = {
    "new":new_dict,
    "med":med_dict,
    "old":old_dict
}

In [67]:
new_dict = new.rounded_vote.value_counts(normalize=True).to_dict()
med_dict = med.rounded_vote.value_counts(normalize=True).to_dict()
old_dict = old.rounded_vote.value_counts(normalize=True).to_dict()

In [68]:
vote_dict = {
    "new":new_dict,
    "med":med_dict,
    "old":old_dict
}

In [69]:
dataset = {
    "vote_distribution":vote_dict,
    "genre_distribution": genre_dict
    
}

In [70]:
import json 

with open(dir_path + "/" + "imdb_distribution.json", "w") as file:
    file.write(json.dumps(dataset, indent=4, sort_keys=True))

### Ranking based on vote

In [None]:
high = df_nltk[df_nltk.avg_vote >= 7]
med = df_nltk[(df_nltk.avg_vote < 7) & (df_nltk.avg_vote >= 5)]
low = df_nltk[(df_nltk.avg_vote < 5) ]

In [None]:
high.genre.value_counts()

In [None]:
med.director.value_counts().to_dict()