<a href="https://colab.research.google.com/github/thaisja/projeto-aplicado-netflix/blob/main/01_NETFLIX_EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# %%
!pip -q install pyarrow fastparquet tqdm

import os, gc, math, random, sys, textwrap, numpy as np, pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Display options: show up to 50 columns and render floats with thousands
# separators and four decimals.
pd.set_option("display.max_columns", 50)
pd.set_option("display.float_format", "{:,.4f}".format)
sns.set()

# Seed both the stdlib and the NumPy global RNGs.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)



In [2]:
import os
from google.colab import drive

# Mount Google Drive so the raw rating files become visible under /content/drive.
drive.mount('/content/drive')

DATA_DIR = "/content/drive/MyDrive/MACKENZIE/4S Ciência de Dados"

# The four raw rating files, combined_data_1.txt .. combined_data_4.txt.
FILES = [f"{DATA_DIR}/combined_data_{part}.txt" for part in range(1, 5)]

# Report, per file, whether it is actually present on the mounted Drive.
for path in FILES:
    print(path, "->", os.path.exists(path))



Mounted at /content/drive
/content/drive/MyDrive/MACKENZIE/4S Ciência de Dados/combined_data_1.txt -> True
/content/drive/MyDrive/MACKENZIE/4S Ciência de Dados/combined_data_2.txt -> True
/content/drive/MyDrive/MACKENZIE/4S Ciência de Dados/combined_data_3.txt -> True
/content/drive/MyDrive/MACKENZIE/4S Ciência de Dados/combined_data_4.txt -> True


In [None]:
!pip -q install pyarrow fastparquet tqdm

import random, pandas as pd, numpy as np
from tqdm import tqdm
import gc

# Reproducibility: seed the stdlib global RNG.
SEED = 42
random.seed(SEED)

# Sampling knobs passed to load_all / iter_netflix_file.
SAMPLE_RATE = 0.002        # value compared against a uniform draw per rating line
MAX_ROWS_PER_FILE = None   # None disables the per-file row cap

# Parquet cache of the combined sample: one copy on the local disk, one on Drive.
CACHE_PARQUET = "/content/netflix_all_sample.parquet"
CACHE_PARQUET_DRIVE = f"{DATA_DIR}/netflix_all_sample.parquet"

def iter_netflix_file(path, sample_rate=1.0, max_rows=None, seed=SEED):
    """Stream sampled rating rows from one raw rating file.

    Lines ending in ``:`` are treated as headers carrying the current movie id;
    every other non-blank line is expected to be ``user_id,rating,date`` and is
    attributed to the most recent header.

    Args:
        path: Path of the text file to read.
        sample_rate: When below 1.0, a rating line is kept only if a uniform
            draw from the private RNG is <= this value; 1.0 or more keeps all.
        max_rows: Stop after this many rows have been yielded (None = no cap).
        seed: Seed for the private ``random.Random`` used for sampling.

    Yields:
        Tuples ``(user_id: int, movie_id, rating: float, date: str)``.
        ``movie_id`` is None if a rating line appears before any header.

    Raises:
        ValueError: If a header line's id is not an integer.

    Note:
        Malformed rating lines (wrong field count, non-numeric user id or
        rating) are skipped. Only ``ValueError`` from parsing is swallowed:
        the ``yield`` sits outside the ``try`` so that ``GeneratorExit`` and
        ``KeyboardInterrupt`` propagate, and closing the generator early no
        longer fails with "generator ignored GeneratorExit".
    """
    rng = random.Random(seed)
    movie_id = None
    out = 0
    with open(path, "r") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            if line.endswith(":"):
                movie_id = int(line[:-1])
                continue

            # Wrong field count: skip before touching the RNG, so the sampling
            # sequence is the same as it was with the original unpacking.
            fields = line.split(",")
            if len(fields) != 3:
                continue
            user_id, rating, dt = fields

            if max_rows is not None and out >= max_rows:
                break
            if sample_rate < 1.0 and rng.random() > sample_rate:
                continue

            try:
                row = (int(user_id), movie_id, float(rating), dt)
            except ValueError:
                # Non-numeric user id or rating: skip this line only.
                continue

            yield row
            out += 1

def load_all(files, sample_rate=SAMPLE_RATE, max_rows_per_file=MAX_ROWS_PER_FILE):
    """Read every file in ``files`` and return the sampled ratings as one DataFrame.

    Args:
        files: Iterable of file paths handed to ``iter_netflix_file``.
        sample_rate: Forwarded as ``sample_rate`` for each file.
        max_rows_per_file: Forwarded as ``max_rows`` for each file.

    Returns:
        DataFrame with columns ``user_id`` (int32), ``movie_id`` (int32),
        ``rating`` (float32) and ``date`` (datetime; values that do not match
        ``%Y-%m-%d`` become NaT).
    """
    records = []
    for path in files:
        print("Lendo:", path)
        sampled_rows = iter_netflix_file(path, sample_rate=sample_rate, max_rows=max_rows_per_file)
        # tqdm wraps the generator only to show progress while it is consumed.
        records.extend(tqdm(sampled_rows))

    frame = pd.DataFrame(records, columns=["user_id", "movie_id", "rating", "date"])
    frame = frame.astype({"user_id": "int32", "movie_id": "int32", "rating": "float32"})
    frame["date"] = pd.to_datetime(frame["date"], format="%Y-%m-%d", errors="coerce")
    return frame
import os

# Load the sampled ratings, preferring the local parquet cache, then the copy
# on Drive, and only re-reading the raw files when neither cache exists.
try:
    if os.path.exists(CACHE_PARQUET):
        df = pd.read_parquet(CACHE_PARQUET)
        print("Carregado do cache local:", CACHE_PARQUET, df.shape)
    elif os.path.exists(CACHE_PARQUET_DRIVE):
        df = pd.read_parquet(CACHE_PARQUET_DRIVE)
        print("Carregado do cache no Drive:", CACHE_PARQUET_DRIVE, df.shape)
    else:
        df = load_all(FILES, sample_rate=SAMPLE_RATE, max_rows_per_file=MAX_ROWS_PER_FILE)
        df.to_parquet(CACHE_PARQUET, index=False)
        df.to_parquet(CACHE_PARQUET_DRIVE, index=False)
        print("Amostra combinada salva em:", CACHE_PARQUET, "e", CACHE_PARQUET_DRIVE, df.shape)
except Exception as e:
    # Keep the friendly message, but re-raise: if the error were swallowed,
    # the df.head() below would either fail with an unrelated NameError or
    # silently display a stale `df` left over from a previous run.
    print("Erro ao carregar/criar cache:", e)
    raise

df.head()


In [None]:
def memory_mb(df):
    """Return the total of ``df.memory_usage(deep=True)`` expressed in MiB."""
    total_bytes = df.memory_usage(deep=True).sum()
    return total_bytes / (1 << 20)

# Quick data-quality summary: size, date range, missing values, exact duplicates.
mem_mb = round(memory_mb(df), 2)
print("Shape:", df.shape, "| Memória (MB):", mem_mb)

date_col = df["date"]
print("Período:", date_col.min(), "->", date_col.max())

missing_per_column = df.isna().sum()
print("\nAusentes:\n", missing_per_column)

n_exact_duplicates = df.duplicated().sum()
print("\nDuplicatas exatas:", n_exact_duplicates)


In [None]:
# Summary statistics of the rating column.
print(df["rating"].describe())

# Distribution table: count and share of each rating value.
rating_counts = df["rating"].value_counts().sort_index()
rating_table = pd.DataFrame({
    "rating": rating_counts.index,
    "count": rating_counts.values,
    "share": (rating_counts / len(df)).values,
})
display(rating_table)

# Bar chart of the same counts.
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(6, 4))
rating_counts.plot(kind="bar", ax=ax)
ax.set_title("Distribuição de notas")
ax.set_xlabel("Nota")
ax.set_ylabel("Contagem")
plt.show()


In [None]:
# Number of ratings per user and per movie.
user_activity = (
    df.groupby("user_id", as_index=False)
      .size()
      .rename(columns={"size": "n_ratings_user"})
)
item_pop = (
    df.groupby("movie_id", as_index=False)
      .size()
      .rename(columns={"size": "n_ratings_item"})
)

print("Usuários únicos:", user_activity.shape[0], "| Itens únicos:", item_pop.shape[0])
display(user_activity["n_ratings_user"].describe())
display(item_pop["n_ratings_item"].describe())

# One histogram per count series, with a log-scaled y axis.
hist_specs = [
    (user_activity["n_ratings_user"], "Avaliações por usuário (log-y)", "# avaliações por usuário"),
    (item_pop["n_ratings_item"], "Avaliações por item (log-y)", "# avaliações por item"),
]
for counts, title, xlabel in hist_specs:
    plt.figure(figsize=(6, 4))
    plt.hist(counts, bins=60)
    plt.yscale("log")
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel("freq (log)")
    plt.show()


In [None]:
# Density of the user x item matrix: observed ratings / all possible pairs.
n_users, n_items = len(user_activity), len(item_pop)
n_obs = len(df)
n_possible_pairs = n_users * n_items
density = n_obs / n_possible_pairs

print(f"Interações: {n_obs:,} | Usuários: {n_users:,} | Itens: {n_items:,}")
print(f"Densidade (sparsidade): {density:.8f} -> matriz extremamente esparsa")


In [None]:
# Rating count and mean rating per movie and per user, each computed once.
item_stats = df.groupby("movie_id")["rating"].agg(n_ratings="count", mean_rating="mean")
user_stats = df.groupby("user_id")["rating"].agg(n_ratings="count", mean_rating="mean")

# 15 most-rated movies.
top_items = item_stats.sort_values("n_ratings", ascending=False).head(15)
display(top_items)

# 15 most active users.
top_users = user_stats.sort_values("n_ratings", ascending=False).head(15)
display(top_users)

# 15 best-rated movies among those with at least MIN_RATINGS ratings.
MIN_RATINGS = 200
enough_ratings = item_stats["n_ratings"] >= MIN_RATINGS
best_items = (
    item_stats[enough_ratings]
    .sort_values("mean_rating", ascending=False)
    .head(15)
)
display(best_items)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

# Truncate each rating date to its month and count ratings per month.
df["month"] = df["date"].values.astype("datetime64[M]")
ts_month = df.groupby("month").size().rename("n_ratings").sort_index()

# Drop the final month from the series when it has ratings on fewer than
# 25 distinct days of the month.
last_month = ts_month.index.max()
n_unique_days = df.loc[df["month"] == last_month, "date"].dt.day.nunique()
if n_unique_days < 25:
    ts_month = ts_month.iloc[:-1]

# Monthly series plus a centred 3-month rolling mean.
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(ts_month.index, ts_month.values, linewidth=1.5, label="Mensal")

roll3 = ts_month.rolling(window=3, center=True).mean()
ax.plot(roll3.index, roll3.values, linewidth=2.0, label="Média móvel (3M)")

# Major ticks per year, minor ticks at the start of each quarter.
ax.xaxis.set_major_locator(mdates.YearLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
ax.xaxis.set_minor_locator(mdates.MonthLocator(bymonth=[1, 4, 7, 10]))

ax.set_title("Avaliações por mês")
ax.set_xlabel("Ano")
ax.set_ylabel("# avaliações")
ax.grid(True, which="both", alpha=0.25)
ax.legend()
plt.tight_layout()
plt.show()

# Ratings per day of the week.
df["weekday"] = df["date"].dt.weekday
wk = df["weekday"].value_counts().sort_index()

fig_wk, ax_wk = plt.subplots(figsize=(6, 4))
ax_wk.bar(wk.index, wk.values)
ax_wk.set_title("Avaliações por dia da semana")
ax_wk.set_xlabel("Dia (0=Seg … 6=Dom)")
ax_wk.set_ylabel("Contagem")
ax_wk.grid(True, axis="y", alpha=0.25)
fig_wk.tight_layout()
plt.show()


In [None]:
# Per-movie rating count and mean rating, plotted against each other.
item_rating_stats = df.groupby("movie_id")["rating"].agg(
    n_ratings="count",
    mean_rating="mean",
)

fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(item_rating_stats["n_ratings"], item_rating_stats["mean_rating"], s=8, alpha=0.5)
ax.set_xscale("log")
ax.set_title("Média x #avaliações por item (log-x)")
ax.set_xlabel("# avaliações (item)")
ax.set_ylabel("média de nota")
plt.show()


In [None]:
# Fraction of users / movies with at most 5 ratings in the sample.
u_cold = user_activity["n_ratings_user"].le(5).mean()
i_cold = item_pop["n_ratings_item"].le(5).mean()

print(f"% usuários com <=5 avaliações: {100*u_cold:.2f}%")
print(f"% itens com <=5 avaliações:   {100*i_cold:.2f}%")


In [None]:
# Cleaning: drop rows without a date, sort, and remove exact repeats of the
# same (user, movie, date, rating).
before = len(df)
df = df.dropna(subset=["date"]).sort_values(["user_id", "movie_id", "date"])

dedup_keys = ["user_id", "movie_id", "date", "rating"]
is_duplicate = df.duplicated(subset=dedup_keys)
dup_count = is_duplicate.sum()
df = df[~is_duplicate].reset_index(drop=True)
print("Removidas (NaT + duplicatas):", before - len(df), "| duplicatas:", dup_count)

# POC subset: keep rows whose user has >= MIN_U ratings and whose movie has
# >= MIN_I ratings. Both counts are taken on the cleaned frame, before filtering.
MIN_U, MIN_I = 5, 5
ratings_per_user = df["user_id"].value_counts()
ratings_per_movie = df["movie_id"].value_counts()
u_keep = df["user_id"].map(ratings_per_user) >= MIN_U
i_keep = df["movie_id"].map(ratings_per_movie) >= MIN_I
df_small = df[u_keep & i_keep].reset_index(drop=True)

print("Shape total:", before, "-> após limpeza:", df.shape, "-> POC:", df_small.shape)

# Persist the POC subset locally and on Drive.
OUT_PARQUET = "/content/netflix_poc_clean.parquet"
OUT_PARQUET_DRIVE = f"{DATA_DIR}/netflix_poc_clean.parquet"
for target in (OUT_PARQUET, OUT_PARQUET_DRIVE):
    df_small.to_parquet(target, index=False)
print("POC salva em:", OUT_PARQUET, "e", OUT_PARQUET_DRIVE)
gc.collect()


In [None]:
# Leave-one-out split: after sorting by user and date, the last row of each
# user goes to validation and every other row to training.
df_small = df_small.sort_values(["user_id", "date"])
is_last_of_user = ~df_small.duplicated(subset="user_id", keep="last")
last_idx = df_small.index[is_last_of_user]
val = df_small.loc[last_idx]
train = df_small.loc[~is_last_of_user]

print("Train:", train.shape, "| Val:", val.shape)
for label, part in (("Período Train:", train), ("Período Val:", val)):
    print(label, part["date"].min(), "->", part["date"].max())


In [None]:
# 1. Instalar dependências
!pip -q install scikit-surprise pyarrow fastparquet

# 2. Corrigir numpy
!pip uninstall -y numpy
!pip install "numpy<2"

# 3. Reiniciar runtime automaticamente
os.kill(os.getpid(), 9)


In [None]:
# Kills the current kernel process with signal 9, forcing a runtime restart;
# all in-memory variables are lost.
# NOTE(review): the previous cell already ends with the same os.kill call, so
# this cell looks redundant; running it after the restart kills the fresh
# kernel again. Confirm whether it can be removed.
import os; os.kill(os.getpid(), 9)

In [None]:
# Sanity check after the runtime restart: report the NumPy version in use and
# confirm that scikit-surprise imports.
import numpy
import pandas

print("NumPy:", numpy.__version__)

from surprise import Dataset, Reader, SVD

print("Surprise import OK!")


In [None]:
# =========================
# POC: recommender system (SVD - collaborative filtering)

# Libraries
import os, math
import numpy as np
import pandas as pd
from surprise import Dataset, Reader, SVD
from sklearn.metrics import mean_squared_error, mean_absolute_error


# 1) Load the parquet file written by the EDA cleaning cell
PARQUET_PATH = "/content/netflix_poc_clean.parquet"
assert os.path.exists(PARQUET_PATH), "Preciso rodar a EDA antes para gerar este arquivo."

poc_columns = ["user_id", "movie_id", "rating", "date"]
df = pd.read_parquet(PARQUET_PATH)[poc_columns]
df["date"] = pd.to_datetime(df["date"])

# 2) Leave-one-out split per user:
#    the last rating of each user (by date) is held out for validation
df = df.sort_values(["user_id", "date"])
val_idx = df.groupby("user_id").tail(1).index
is_val = df.index.isin(val_idx)
val_df = df[is_val].copy()
train_df = df[~is_val].copy()

print("Train:", train_df.shape, "| Val:", val_df.shape)

# 3) Fit SVD on the training ratings
reader = Reader(rating_scale=(1, 5))
train_data = Dataset.load_from_df(train_df[["user_id", "movie_id", "rating"]], reader)
trainset = train_data.build_full_trainset()

# Hyperparameters are written out explicitly; the seed keeps the fit repeatable
svd = SVD(n_factors=100, n_epochs=20, lr_all=0.005, reg_all=0.02, random_state=42)
svd.fit(trainset)

# 4) Evaluate on validation: one prediction per held-out (user, movie) pair
preds = [
    svd.predict(int(uid), int(mid)).est
    for uid, mid in zip(val_df["user_id"], val_df["movie_id"])
]

y_true = val_df["rating"].astype(float).values
y_hat = np.array(preds)

rmse = math.sqrt(mean_squared_error(y_true, y_hat))
mae = mean_absolute_error(y_true, y_hat)
print(f"SVD POC -> RMSE: {rmse:.4f} | MAE: {mae:.4f}")

# 5) Simple baseline: predict the global training mean for every pair,
#    only to show that the model improves on it
global_mean = train_df["rating"].mean()
baseline_pred = np.full_like(y_true, global_mean)
rmse_base = math.sqrt(mean_squared_error(y_true, baseline_pred))
mae_base = mean_absolute_error(y_true, baseline_pred)

print(f"Baseline (média) -> RMSE: {rmse_base:.4f} | MAE: {mae_base:.4f}")
print(f"Ganho vs baseline (ΔRMSE): {(rmse_base - rmse):.4f}")

# 6) Top-10 recommendations for one user (demonstration);
#    the first user of the training frame is used as the example
user_example = int(train_df["user_id"].iloc[0])
rated_by_user = train_df.loc[train_df["user_id"] == user_example, "movie_id"]
seen = set(rated_by_user.unique())
all_items = set(train_df["movie_id"].unique())

# Score at most 5000 unseen movies so the demo runs quickly
candidates = list(all_items - seen)[:5000]

scores = [(iid, svd.predict(user_example, int(iid)).est) for iid in candidates]
top10 = sorted(scores, key=lambda pair: pair[1], reverse=True)[:10]

print("\nTop-10 itens recomendados (movie_id, score) para o usuário", user_example)
for movie, score in top10:
    print(movie, f"{score:.3f}")


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Metrics to plot. Use the values computed by the POC cell in this session
# (rmse, mae, rmse_base, mae_base) so the figure always matches the current
# run. The numbers recorded from an earlier POC run are only a fallback for
# when that cell has not been executed. They were previously hard-coded here,
# which left the chart stale and overwrote the freshly computed baseline.
_session = globals()
rmse_svd = float(_session.get("rmse", 1.0190))
mae_svd = float(_session.get("mae", 0.8323))
rmse_base = float(_session.get("rmse_base", 1.0776))
mae_base = float(_session.get("mae_base", 0.9019))

metrics = ["RMSE", "MAE"]
baseline_vals = [rmse_base, mae_base]
svd_vals      = [rmse_svd, mae_svd]

# Grouped bars: baseline on the left and SVD on the right of each metric.
x = np.arange(len(metrics))
width = 0.35

fig, ax = plt.subplots(figsize=(7,4))
rects1 = ax.bar(x - width/2, baseline_vals, width, label="Baseline (média)")
rects2 = ax.bar(x + width/2, svd_vals,      width, label="SVD (POC)")

ax.set_title("Figura 7 – Comparação de erro: Baseline vs. SVD (POC)")
ax.set_ylabel("Valor da métrica")
ax.set_xticks(x, metrics)
ax.legend()
ax.grid(axis="y", alpha=0.25)

# Print each value just above its bar.
for rects in [rects1, rects2]:
    for r in rects:
        h = r.get_height()
        ax.annotate(f"{h:.4f}", xy=(r.get_x()+r.get_width()/2, h),
                    xytext=(0, 3), textcoords="offset points", ha="center", va="bottom")

plt.tight_layout()
plt.show()
