Installing Necessary Pacakges for this notebook

In [61]:
! pip install pandas sklearn nltk



Loading the csv files

In [62]:
import pandas as pd

In [63]:
credits = pd.read_csv("data/tmdb_5000_credits.csv")
movies = pd.read_csv("data/tmdb_5000_movies.csv")

In [64]:
movies = movies.merge(credits, on="title")

### Data Preprocessing 
1. Feature Selection (Dimentionality Reduction)

In [65]:
# genres
# id
# keywords
# title
# overview
# cast
# crew

movies = movies[["id", "title", "overview", "genres", "keywords", "cast", "crew"]]

2. Feature cleaning

In [66]:
movies.dropna(inplace=True)

3. Checking for Duplicate values

In [67]:
movies.duplicated().sum()

0

4. Applying function on features to convert them to lists

In [68]:
import ast # Abstract Syntax Tree


def convert_to_list(column: str) -> list:
    return [item['name'] for item in ast.literal_eval(column)]

In [69]:
movies['genres'] = movies['genres'].apply(convert_to_list)

In [70]:
movies['keywords'] = movies['keywords'].apply(convert_to_list)

In [71]:
movies['cast'] = movies['cast'].apply(lambda casts: convert_to_list(casts)[: 3])

In [72]:
movies['crew'] = movies['crew'].apply(lambda crews: [crew['name'] for crew in ast.literal_eval(crews) if crew['job'] == 'Director'])

In [73]:
movies['overview'] = movies['overview'].apply(lambda text: text.split())

In [74]:
def concatenate_words(strings: list[str]) -> list[str]:
    return [string.replace(" ", "") for string in strings]


In [75]:
movies['genres'] = movies['genres'].apply(concatenate_words)
movies['keywords'] = movies['keywords'].apply(concatenate_words)
movies['cast'] = movies['cast'].apply(concatenate_words)
movies['crew'] = movies['crew'].apply(concatenate_words)

5. New Feature Creation

In [76]:
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [77]:
new_df = pd.DataFrame(movies[['id', 'title', 'tags']])

In [78]:
print(new_df)

          id                                     title  \
0      19995                                    Avatar   
1        285  Pirates of the Caribbean: At World's End   
2     206647                                   Spectre   
3      49026                     The Dark Knight Rises   
4      49529                               John Carter   
...      ...                                       ...   
4804    9367                               El Mariachi   
4805   72766                                 Newlyweds   
4806  231617                 Signed, Sealed, Delivered   
4807  126186                          Shanghai Calling   
4808   25975                         My Date with Drew   

                                                   tags  
0     [In, the, 22nd, century,, a, paraplegic, Marin...  
1     [Captain, Barbossa,, long, believed, to, be, d...  
2     [A, cryptic, message, from, Bond’s, past, send...  
3     [Following, the, death, of, District, Attorney...  
4     [John, 

In [79]:
new_df['tags'] = new_df['tags'].apply(lambda lst: ' '.join(map(str.lower, lst)))

6. Apply stemming on features

In [80]:
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()

In [81]:
# with stemming
new_df['tags'] = new_df['tags'].apply(lambda tags: ' '.join(ps.stem(tag) for tag in tags.split()))

In [82]:
new_df.head()

Unnamed: 0,id,title,tags
0,19995,Avatar,"in the 22nd century, a parapleg marin is dispa..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believ to be dead, ha c..."
2,206647,Spectre,a cryptic messag from bond’ past send him on a...
3,49026,The Dark Knight Rises,follow the death of district attorney harvey d...
4,49529,John Carter,"john carter is a war-weary, former militari ca..."


In [83]:
from sklearn.feature_extraction.text import CountVectorizer


cv = CountVectorizer(max_features=10000, stop_words='english')

In [84]:
vectors = cv.fit_transform(new_df['tags']).toarray()

In [85]:
cv.get_feature_names_out()[:500]

array(['00', '000', '007', '10', '100', '10th', '11', '12', '12th', '13',
       '14', '15', '150', '15th', '16', '16th', '17', '17th', '18',
       '1863', '1890', '18th', '18thcenturi', '19', '1910', '1920',
       '1927', '1930', '1930s', '1937', '1940', '1940s', '1941', '1944',
       '1945', '1950', '1950s', '1955', '1959', '1960', '1960s', '1962',
       '1964', '1965', '1967', '1969', '1970', '1970s', '1971', '1972',
       '1973', '1974', '1976', '1977', '1979', '1980', '1980s', '1984',
       '1985', '1986', '1987', '1990', '1994', '1995', '1996', '1997',
       '1999', '19th', '19thcenturi', '20', '200', '2000', '2001', '2002',
       '2003', '2004', '2007', '2008', '2009', '2011', '2012', '20th',
       '21st', '21stcenturi', '22nd', '23', '24', '25', '27', '28', '29',
       '30', '300', '35', '3d', '40', '400', '47', '50', '500', '51',
       '60', '60s', '70', '7th', '80', 'aaron', 'aaroneckhart',
       'aaronseltz', 'aarontaylor', 'abandon', 'abbi', 'abbiecornish',
    

7. Calculating using cosine similarity instead of Euclidean Distance

In [86]:
from sklearn.metrics.pairwise import cosine_similarity

In [87]:
similarity = cosine_similarity(vectors)

In [88]:
def recommend(movie):
    movie_index = new_df[new_df['title'] == movie].index[0]
    distances = similarity[movie_index]
    movies = enumerate(distances)
    recommended_movies = sorted(movies, reverse=True, key=lambda m: m[1])
    top_5_recommendations = recommended_movies[1:6]

    return [new_df.iloc[index].title for index, _ in top_5_recommendations]

8. Testing on a data

In [89]:
recommend('Avatar')

['Aliens vs Predator: Requiem',
 'Aliens',
 'Falcon Rising',
 'Titan A.E.',
 'Independence Day']

9. Exporting new Datasets


In [90]:
import pickle


pickle.dump(new_df, open('movies.pkl', 'wb'))
pickle.dump(similarity, open('similarity.pkl', 'wb'))