In [None]:
# Cell 1 — ensure Python can see your project root & “files” folder
import os, sys

# The project root is the parent of the directory the notebook runs from;
# the "files" folder sits directly inside it.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
files_dir = os.path.join(project_root, 'files')

# Put both folders at the front of the import search path
# (files_dir ends up first, project_root second).
sys.path[:0] = [files_dir, project_root]

import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from models import (
    db,
    User, Year, Director, Studio, Genre,
    Producer, CastMember, Movie, Favorite, Rating
)

# configure engine/session
# The connection string embeds the database password, so it is read from the
# DATABASE_URI environment variable instead of being written into the notebook.
# Expected shape (placeholders, not real values):
#   postgresql://USER:PASSWORD@HOST:PORT/movies_db?sslmode=require&sslrootcert=./do_ca.crt
# NOTE(review): a real password was previously committed in this cell — rotate it.
DATABASE_URI = os.environ.get("DATABASE_URI")
if not DATABASE_URI:
    # Fail early with a clear message rather than letting create_engine()
    # choke on a missing URL.
    raise RuntimeError(
        "DATABASE_URI is not set; export the PostgreSQL connection string "
        "before running this notebook"
    )
engine       = create_engine(DATABASE_URI)
Session      = sessionmaker(bind=engine)
session      = Session()   # single session shared by every cell below

print("Imports and DB session ready")

Imports and DB session ready


In [2]:
# Rebuild the schema from scratch: drop every table registered on db.metadata,
# then create them again. WARNING: this discards all rows in those tables.
schema = db.metadata
schema.drop_all(bind=engine)
schema.create_all(bind=engine)
print("Dropped & recreated all tables")

Dropped & recreated all tables


In [3]:
# Cell 3 — Load your CSV and helper
# Location of the movie dataset. Set the MOVIES_CSV environment variable to
# load it from somewhere else; the fallback is the original author's local path.
csv_path = os.environ.get(
    'MOVIES_CSV',
    '/Users/tolubai/Desktop/csci_final_project/datasets/movies_with_embeddings.csv',
)
df = pd.read_csv(csv_path)

# If the same movie appears twice in the CSV, keep only its first row.
# NOTE(review): matching on Title alone also drops distinct films that share
# a title (e.g. remakes) — confirm this is intended.
df = df.drop_duplicates(subset=['Title'], keep='first').reset_index(drop=True)

def clean_and_split(cell):
    """Split a comma-separated cell into a list of trimmed, non-empty names.

    A missing cell (anything pd.isna() reports as missing) yields an empty
    list. Otherwise the cell is split on ',', each piece is stripped of
    surrounding whitespace, and pieces that end up empty are discarded.
    Order and duplicates are preserved.
    """
    if pd.isna(cell):
        return []
    trimmed = (piece.strip() for piece in cell.split(','))
    return [piece for piece in trimmed if piece]

# Report how many rows remain after de-duplication.
print(f"Loaded {df.shape[0]} rows from CSV")

Loaded 9499 rows from CSV


In [4]:
# Cell 4 — Populate lookup tables
# Each lookup table receives one row per distinct value found in the CSV,
# added in sorted order. Nothing is written until the single commit at the end.
print("Populating lookup tables…")

# Years — single-valued column, stored as int
session.add_all([
    Year(year_value=int(year))
    for year in sorted(df['Year'].dropna().unique())
])

# Directors — single-valued column
session.add_all([
    Director(director_name=name)
    for name in sorted(df['Director'].dropna().unique())
])

# Studios — comma-separated column, flattened into a set of names
all_studios = {
    name
    for cell in df['Studios'].dropna()
    for name in clean_and_split(cell)
}
session.add_all([Studio(studio_name=name) for name in sorted(all_studios)])

# Genres — comma-separated column
all_genres = {
    name
    for cell in df['Genre'].dropna()
    for name in clean_and_split(cell)
}
session.add_all([Genre(genre_name=name) for name in sorted(all_genres)])

# Producers — comma-separated column
all_producers = {
    name
    for cell in df['Producers'].dropna()
    for name in clean_and_split(cell)
}
session.add_all([Producer(producer_name=name) for name in sorted(all_producers)])

# Cast members — comma-separated column
all_cast = {
    name
    for cell in df['Cast'].dropna()
    for name in clean_and_split(cell)
}
session.add_all([CastMember(cast_name=name) for name in sorted(all_cast)])

session.commit()
print("Lookup tables populated")

Populating lookup tables…
Lookup tables populated


In [5]:
# Cell 5 — Build in-Python maps for quick FK assignment
def _index_by(model, key_attr):
    """Load every row of `model` and key the ORM objects by their `key_attr` value."""
    return {getattr(obj, key_attr): obj for obj in session.query(model).all()}

# One dict per lookup table: natural key (year / name) -> persisted ORM object.
year_map     = _index_by(Year, 'year_value')
director_map = _index_by(Director, 'director_name')
studio_map   = _index_by(Studio, 'studio_name')
genre_map    = _index_by(Genre, 'genre_name')
producer_map = _index_by(Producer, 'producer_name')
cast_map     = _index_by(CastMember, 'cast_name')

print(
    "Mapping dicts built:",
    len(year_map), "years,",
    len(director_map), "directors,",
    len(studio_map), "studios, etc.",
)

Mapping dicts built: 130 years, 4786 directors, 10272 studios, etc.


In [6]:
# Cell 6 — Insert Movies + set up all relationships (deduped)
print("Populating movies and link‐tables…")

def _none_if_missing(value):
    """Return None for a pandas missing value (NaN/None/NA), else the value unchanged."""
    return None if pd.isna(value) else value

for _, row in df.iterrows():
    # Empty CSV cells arrive as float NaN. Every nullable column is normalised
    # to None so the database receives NULL rather than a NaN value — the
    # numeric columns were already guarded, the text columns were not.
    m = Movie(
        title       = row['Title'],
        description = _none_if_missing(row['Description']),
        avg_rating  = float(row['AvgRating']) if pd.notna(row['AvgRating']) else None,
        duration    = int(row['Duration'])     if pd.notna(row['Duration']) else None,
        poster_url  = _none_if_missing(row['Poster URL']),
        page_url    = _none_if_missing(row['Page URL']),
        embeddings  = _none_if_missing(row['embeddings_minilm'])
    )

    # Single-valued relationships: left unset when the CSV cell is missing.
    if pd.notna(row['Year']):
        m.year     = year_map.get(int(row['Year']))
    if pd.notna(row['Director']):
        m.director = director_map.get(row['Director'])

    # Multi-valued relationships: set() removes names repeated within one cell
    # so the same object is not appended to a movie twice.
    # clean_and_split() returns [] for a missing cell, so those loops are no-ops.
    for s in set(clean_and_split(row['Studios'])):
        m.studios.append(studio_map[s])
    for g in set(clean_and_split(row['Genre'])):
        m.genres.append(genre_map[g])
    for p in set(clean_and_split(row['Producers'])):
        m.producers.append(producer_map[p])
    for c in set(clean_and_split(row['Cast'])):
        m.cast_members.append(cast_map[c])

    session.add(m)

# One commit for the whole batch.
session.commit()
print("Movies + associations populated (duplicates removed)")

Populating movies and link‐tables…
Movies + associations populated (duplicates removed)


In [7]:
# Cell 7 — Seed a default admin user
from werkzeug.security import generate_password_hash

ADMIN_EMAIL = 'admin@movieapp.com'

# ADMIN_PASSWORD overrides the seed password; the fallback is the original
# placeholder. NOTE(review): 'password' is only acceptable for local
# development — set ADMIN_PASSWORD for any shared or hosted database.
admin_password_hash = generate_password_hash(
    os.environ.get('ADMIN_PASSWORD', 'password')
)

# Look the account up by e-mail rather than relying on session.merge():
# merge() matches an existing row by primary key only, so it acts as an upsert
# only if email is the primary key. NOTE(review): the commented SQL at the end
# of this notebook references users(user_id), which suggests it is not.
# Querying explicitly gives update-or-insert behaviour either way.
admin = session.query(User).filter_by(email=ADMIN_EMAIL).first()
if admin is None:
    admin = User(
        email     = ADMIN_EMAIL,
        password  = admin_password_hash,
        is_admin  = True
    )
    session.add(admin)
else:
    # Existing account: refresh the same fields the merge-based upsert would overwrite.
    admin.password = admin_password_hash
    admin.is_admin = True
session.commit()
print("Default admin user created (if not already present)")

Default admin user created (if not already present)


In [8]:
# Cell 8 — Quick counts to verify
from sqlalchemy import text

def _row_count(table_name):
    """Return COUNT(*) for `table_name` using raw SQL on the notebook session.

    The table name is interpolated into the statement, so only pass the
    fixed names listed below — never untrusted input.
    """
    return session.execute(text(f"SELECT COUNT(*) FROM {table_name}")).scalar()

print("Final counts:")
# (label, zero-argument callable producing the count); evaluated and printed in order.
for label, fetch_count in (
    (" Movies:         ", lambda: session.query(Movie).count()),
    (" Studios links:  ", lambda: _row_count("movie_studios")),
    (" Genres links:   ", lambda: _row_count("movie_genres")),
    (" Producers link: ", lambda: _row_count("movie_producers")),
    (" Cast links:     ", lambda: _row_count("movie_cast")),
    (" Users:          ", lambda: session.query(User).count()),
    (" Favorites:      ", lambda: _row_count("favorites")),
    (" Ratings:        ", lambda: _row_count("ratings")),
):
    print(label, fetch_count())

Final counts:
 Movies:          9499
 Studios links:   29328
 Genres links:    23509
 Producers link:  32970
 Cast links:      324799
 Users:           1
 Favorites:       0
 Ratings:         0


In [None]:
# The SQL below will be needed later, when building the Admin functionality.
# CREATE TABLE activity_log (
#   id         SERIAL PRIMARY KEY,
#   user_id    INTEGER NOT NULL REFERENCES users(user_id),
#   action     VARCHAR(50) NOT NULL,
#   detail     JSONB,
#   created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT now()
# );

####################################################################################################################

# ALTER TABLE users ADD COLUMN active BOOLEAN;
# UPDATE users           SET active = TRUE;
# ALTER TABLE users
#   ALTER COLUMN active SET NOT NULL,
#   ALTER COLUMN active SET DEFAULT TRUE;
  
########################################################## OR ##########################################################  
  
# ALTER TABLE users
#   DROP COLUMN IF EXISTS active;

# ALTER TABLE users
#   ADD COLUMN IF NOT EXISTS active
#     BOOLEAN NOT NULL
#     DEFAULT TRUE;