In [None]:
import numpy as np
import pandas as pd
import ast
import nltk

In [None]:
# Read the two TMDB CSV exports: movie metadata first, then the matching credits
movies, credits = (
    pd.read_csv(csv_name)
    for csv_name in ('tmdb_5000_movies.csv', 'tmdb_5000_credits.csv')
)

In [None]:
movies.head(1)

In [None]:
credits.head(1)['cast'].values

In [None]:
# Join on the movie id rather than the title: titles are not guaranteed to be unique,
# and joining on a repeated title pairs every movie with every credits row sharing it,
# which silently duplicates rows.
# credits' own `title` column is dropped first so the result keeps a single `title` column.
# assumes movies has an `id` column matching credits' `movie_id` — TODO confirm
movies = movies.merge(credits.drop(columns='title'), left_on='id', right_on='movie_id')

In [None]:
movies.shape

In [None]:
credits.shape

In [None]:
movies.head(1)

In [None]:
# Keep only the columns that feed the recommender
movies = movies.loc[:, ['movie_id', 'genres', 'keywords', 'title', 'cast', 'crew', 'overview']]
# budget
# genres
# id
# keywords
# original_title
# overview
# runtime
# tagline
# cast
# crew


# Next we merge the genres, cast and crew columns (together with keywords and overview) into a single tags column.
# We keep only the top three cast members and only the director from the crew.

# This is part of the data preprocessing.






In [None]:
movies


In [None]:
movies.isnull().sum()

In [None]:
movies.duplicated().sum()

In [None]:
movies.iloc[0].genres

In [None]:
# '[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

# We have to convert this format into a plain list of names:
#  ['Action', 'Adventure', 'Fantasy', 'Science Fiction']

In [None]:
def convert(obj):
    """Parse a string holding a list of dicts and return each dict's 'name' value.

    obj: string containing a Python-literal list of dicts, each with a 'name' key.
    Returns the names as a list, in their original order.
    """
    return [entry['name'] for entry in ast.literal_eval(obj)]

In [None]:
convert( '[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'
)

In [None]:
movies['genres'].apply(convert)

In [None]:
# Replace the raw genre strings with plain lists of genre names, then preview the result
movies['genres'] = movies['genres'].map(convert)
movies.head()


In [None]:
movies.iloc[0].keywords

In [None]:
# Same conversion as for genres: raw keyword strings become lists of keyword names
movies['keywords'] = movies['keywords'].map(convert)

In [None]:
movies

In [None]:
movies['cast'][0]

In [None]:
def convert3(obj, limit=3):
    """Return the names of the first `limit` entries in a stringified list of dicts.

    obj: string containing a Python-literal list of dicts, each with a 'name' key.
    limit: maximum number of names to return (defaults to the top three).
    Returns a list with at most `limit` names, in their original order.
    """
    # Slicing caps the result; the previous hand-rolled counter used `cnt++1`,
    # which is just the expression `cnt + (+1)` and never advanced the counter.
    return [member['name'] for member in ast.literal_eval(obj)[:limit]]

In [None]:
# Reduce each raw cast string to a list of cast member names
movies['cast'] = movies['cast'].map(convert3)

In [None]:
movies['crew'][3]



In [None]:
def fetch_director(obj):
    """Return a one-element list with the first crew member whose job is 'Director'.

    obj: string containing a Python-literal list of dicts with 'job' and 'name' keys.
    Returns [name] for the first director found, or [] when there is none.
    """
    for member in ast.literal_eval(obj):
        if member['job'] != 'Director':
            continue
        # Only the first director is kept
        return [member['name']]
    return []

In [None]:
# Reduce each raw crew string to a list holding just the director's name
movies['crew'] = movies['crew'].map(fetch_director)

In [None]:
movies

In [None]:
movies['overview'][0]
# The overview is a single string, so we need to convert it into a list before we can concatenate it with the other list columns.


In [None]:
# Convert each overview string into a list of word tokens.

nltk.download('punkt')
# NOTE(review): newer NLTK releases load the tokenizer data from 'punkt_tab' — confirm against the installed version
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize
# Drop movies without an overview, then renumber the rows 0..n-1.
# Without the reset the index keeps gaps, and index labels no longer match row
# positions — later cells use those labels to index the similarity matrix.
movies = movies.dropna(subset=['overview']).reset_index(drop=True)

movies['overview'] = movies['overview'].apply(word_tokenize)


In [None]:
# Sanity check: every tokenised overview should be a list.
# Only offending rows are reported, followed by a one-line summary, instead of
# printing one line for every row of the frame.
not_lists = [
    (position, type(tags))
    for position, tags in enumerate(movies['overview'])
    if not isinstance(tags, list)
]
for position, found_type in not_lists:
    print(f"Element at index {position} is not a list. Type: {found_type}")
print(f"{len(movies) - len(not_lists)} of {len(movies)} overview entries are lists.")
        

In [None]:
movies['overview'][0]

In [None]:
# Remove the spaces inside multi-word entries so each one becomes a single token.
# The result is assigned back; without the assignment the column stays unchanged.
movies['keywords'] = movies['keywords'].apply(lambda x:[i.replace(" ","") for i in x])

In [None]:
# Assign the space-stripped genre names back so the change is actually kept
movies['genres'] = movies['genres'].apply(lambda x:[i.replace(" ","") for i in x])

In [None]:
# Assign the space-stripped cast names back so the change is actually kept
movies['cast'] = movies['cast'].apply(lambda x:[i.replace(" ","") for i in x])

In [None]:
# Assign the space-stripped director names back so the change is actually kept
movies['crew'] = movies['crew'].apply(lambda x:[i.replace(" ","") for i in x])



In [None]:
# Each of these columns holds a list per movie, so `+` concatenates the lists row by row
# into one combined list stored in the new 'tags' column.
movies['tags'] = movies['overview'] + movies['keywords'] + movies['cast'] + movies['crew'] + movies['genres']

In [None]:
movies.head()

In [None]:
# Take an explicit copy: new_df has its 'tags' column reassigned in later cells, and
# assigning into a bare slice of `movies` triggers pandas' SettingWithCopyWarning.
new_df = movies[['movie_id', 'title', 'tags']].copy()

In [None]:
new_df

In [None]:
# Convert each tags list back into a single space-separated string.
# The isinstance guard makes the cell safe to re-run: joining an already-joined
# string would insert a space between every character.
new_df['tags'] = new_df['tags'].apply(lambda x: x if isinstance(x, str) else " ".join(x))


In [None]:
new_df['tags'][0]


In [None]:
new_df.head()

In [None]:
# Lower-case every tags string so that matching later on is case-insensitive
new_df['tags'] = new_df['tags'].map(str.lower)

In [None]:
# now we have a fine clear data, now we have to vectorize the data
'''
VECTORIZATION

this is the process of converting text into vectors. then the vectors closest to a particular vector will be the best match for that particular vector.
                      Vectorisation
                       /    |     \
                      /     |      \
                     /      |       \
            bag of words   
'''

In [None]:
# Words such as (acted, acting, actors, actor) are all variants of the same word, so we reduce each of them to a common stem.
# This process is called stemming.

In [None]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [None]:
def stem(text):
    """Stem every whitespace-separated word of `text` with the module-level `ps` stemmer.

    Returns the stemmed words joined back together with single spaces.
    """
    return " ".join([ps.stem(word) for word in text.split()])

In [None]:
# Replace every word in the tags with its stem
new_df['tags'] = new_df['tags'].map(stem)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer: capped at 5000 features, using the built-in English stop-word list
cv = CountVectorizer(
    stop_words='english',
    max_features=5000,
)

In [None]:
# Fit the vectorizer on the tag strings and turn the resulting matrix into a dense array,
# one row per movie.
vectors = cv.fit_transform(new_df['tags']).toarray()

In [None]:
# get_feature_names() was removed from scikit-learn; get_feature_names_out() is its replacement
cv.get_feature_names_out()


In [None]:
vectors

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Pairwise cosine similarity between all rows of `vectors`:
# similarity[i][j] compares the movie in row i with the movie in row j.
similarity = cosine_similarity(vectors)

In [None]:
# new_df['title'] == 'Avatar' is a boolean mask over the rows;
# .index[0] then gives the index label of the first matching row
movie_index = new_df.loc[new_df['title'] == 'Avatar'].index[0]
movie_index

In [None]:
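'''
1. enumerate pairs every similarity score with its position, giving (index, score) tuples, so the index is kept after sorting.
2. reverse=True sorts in descending order, so the most similar movies come first.
3. key=lambda x: x[1] sorts on the score (element 1 of each tuple); slicing the sorted list with [1:6] returns the top five matches, skipping the first entry, which is the movie itself.
'''
# sorted(list(enumerate(similarity[0])), reverse=True, key = lambda x:x[1])[1:6]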

In [None]:
def recommend(movie, top_n=5):
    """Print the titles of the `top_n` movies most similar to `movie`.

    movie: exact title to look up in new_df['title'].
    top_n: number of recommendations to print (default 5).
    Raises IndexError when no row of new_df has that title.
    """
    # Look the movie up by row position, not index label: rows of `similarity`
    # are positional, while new_df's labels have gaps once rows have been dropped.
    matches = np.flatnonzero((new_df['title'] == movie).to_numpy())
    if matches.size == 0:
        raise IndexError(f"No movie titled {movie!r} found in new_df")
    movie_pos = matches[0]

    scores = similarity[movie_pos]    # similarity of every movie to the one at this position
    ranked = sorted(enumerate(scores), reverse=True, key=lambda pair: pair[1])
    # Exclude the queried movie explicitly instead of assuming it is ranked first
    # (another movie with identical tags can tie with it).
    movies_list = [pair for pair in ranked if pair[0] != movie_pos][:top_n]

    for position, _score in movies_list:
        print(new_df.iloc[position].title)

In [None]:
recommend('1982')

In [None]:
# new_df.iloc[1216][1] #actually here index 1 is 'title' of the movie

In [None]:
import pickle

In [None]:
# Write new_df to disk; the with-block closes the file even if pickling fails.
# pickle.dump returns None, so its result is no longer bound to a name.
with open('movies.pkl', 'wb') as movies_file:  # wb stands for write binary mode
    pickle.dump(new_df, movies_file)

In [None]:
# NOTE(review): this rebinds `movies` from the full DataFrame to just the title column,
# so earlier cells that expect the DataFrame will fail if re-run after this one.
movies = new_df['title']  # this will fetch the name of all movies
movies

In [None]:
# Write the similarity matrix to disk; the with-block guarantees the file is closed
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)

In [1]:
# Loading raw german data
# "utf-8" used to be passed positionally, which pandas reads as the separator, not the
# encoding (and newer pandas rejects positional arguments after the path altogether).
# presumably the file holds one sentence per line — TODO confirm; a tab separator with
# quoting disabled keeps commas and quote characters inside a sentence in one column.
german_df = pd.read_csv(
    "europarl-v7.de-en.de",
    sep="\t",
    quoting=3,  # same value as csv.QUOTE_NONE
    encoding="utf-8",
    header=None,
    names=["German"],
)
german_df.head()

NameError: name 'pd' is not defined