In [None]:
# save_data.py
import pandas as pd
import logging
import ast
import yaml
import pickle
from pathlib import Path
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# ---------- 1. Load config ----------
# Read the YAML settings file and derive the input CSV paths from it.
with open("config.yaml", mode="r", encoding="utf-8") as config_file:
    cfg = yaml.safe_load(config_file)

# Both CSV files are resolved relative to the configured data directory.
DATA_DIR = Path(cfg["data_dir"])
MOVIES_CSV = DATA_DIR.joinpath(cfg["movies_file"])
CREDITS_CSV = DATA_DIR.joinpath(cfg["credits_file"])

# ---------- 2. Logging ----------
# Configure the root logger: threshold INFO, records rendered as
# "<LEVEL>: <message>".
logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.INFO)

# ---------- 3. Load & merge ----------
# Read the two input tables from the paths derived from the config.
movies = pd.read_csv(MOVIES_CSV)
credits = pd.read_csv(CREDITS_CSV)
logging.warning("Movies & Credits CSV loaded")

# Join on the shared "title" column, keep only the columns used to build
# tags, and discard any row with a missing value in one of them.
# NOTE(review): if titles are not unique in either file, this join emits one
# row per matching pair -- confirm that is acceptable for the data set.
tag_source_columns = ["id", "title", "overview", "genres", "keywords", "cast", "crew"]
movies = pd.merge(movies, credits, on="title")[tag_source_columns].dropna()
logging.warning("Merged & cleaned")

# ---------- 4. Helper parsers ----------
def convert(text):
    """Return the ``"name"`` value of every dict in a stringified list.

    ``text`` is parsed with ``ast.literal_eval``; each resulting item is
    indexed by ``"name"``, in order.
    """
    records = ast.literal_eval(text)
    return [record["name"] for record in records]
def convert_cast(text):
    """Return the ``"name"`` values of the first five dicts in a stringified list.

    ``text`` is parsed with ``ast.literal_eval`` and the parsed sequence is
    truncated to its first five items before the names are collected.
    """
    leading_entries = ast.literal_eval(text)[:5]
    return [entry["name"] for entry in leading_entries]
def get_director(text):
    """Return the names of entries whose ``"job"`` equals ``"Director"``.

    ``text`` is parsed with ``ast.literal_eval``; every matching entry is
    kept, so the result may hold zero, one, or several names.
    """
    crew_members = ast.literal_eval(text)
    return [member["name"] for member in crew_members if member["job"] == "Director"]
def clean_names(lst):
    """Return a new list with every space character removed from each string."""
    return [name.replace(" ", "") for name in lst]

# ---------- 5. Build tags ----------
# Parse each stringified column into a list of names, then strip the spaces
# inside every name so a multi-word name becomes a single token.
for column, parser in (
    ("genres", convert),
    ("keywords", convert),
    ("cast", convert_cast),
    ("crew", get_director),
):
    movies[column] = movies[column].apply(parser).apply(clean_names)

# The overview is split on whitespace into a list of words.
movies["overview"] = movies["overview"].apply(str.split)

# Concatenate the per-row token lists, then join them into one
# space-separated string per movie.
token_lists = (
    movies["overview"] + movies["genres"] + movies["keywords"] +
    movies["cast"] + movies["crew"]
)
movies["tags"] = token_lists.apply(" ".join)

# Work on an independent copy holding only the columns that are persisted,
# with the tag text lower-cased.
new = movies[["id", "title", "tags"]].copy()
new.loc[:, "tags"] = new["tags"].apply(str.lower)
logging.warning("Tags built")

# ---------- 6. Vectorise ----------
# Count-based bag of words over the tag strings, capped at 5000 features with
# English stop words removed.
cv = CountVectorizer(stop_words="english", max_features=5000)
term_counts = cv.fit_transform(new["tags"])
vector = term_counts.toarray()

# Cosine similarity between the count vectors of the movies.
similarity = cosine_similarity(vector)
logging.warning("Similarity matrix ready")

# ---------- 7. Persist ----------
# Write the tag table and the similarity matrix to disk. Each file is opened
# in a `with` block so the handle is flushed and closed deterministically
# (passing a bare open(...) to pickle.dump leaves the file object unclosed).
with open("movies_ta.pkl", "wb") as movies_out:
    pickle.dump(new, movies_out)
with open("similarity_ta.pkl", "wb") as similarity_out:
    pickle.dump(similarity, similarity_out)
logging.warning("Pickles saved → movies_ta.pkl  &  similarity_ta.pkl")




Top 5 movies similar to 'Avatar':

Titan A.E.
Independence Day
Aliens vs Predator: Requiem
Small Soldiers
Battle: Los Angeles

Top 5 movies similar to 'Batman v Superman: Dawn of Justice':

Man of Steel
Batman Begins
Batman
Teenage Mutant Ninja Turtles
Defendor


In [3]:
import pickle
# Write the tag table and the similarity matrix to disk. The `with` blocks
# guarantee each file handle is flushed and closed once its dump finishes.
with open("movies_ta.pkl", "wb") as movies_out:
    pickle.dump(new, movies_out)
with open("similarity_ta.pkl", "wb") as similarity_out:
    pickle.dump(similarity, similarity_out)