<a href="https://colab.research.google.com/github/xMigulito/Trabalho-de-IA/blob/main/Trabalho_de_IA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# CÉLULA 1 – Instalar tudo que precisa (roda em 15 segundos)
!pip install -q gdown xgboost python-pptx pandas scikit-learn

# Trabalho Final – Inteligência Artificial
### Análise de Vídeos do YouTube
Professor: Luan Oliveira

In [None]:
import kagglehub
import os
import pandas as pd

# Download the dataset with kagglehub; `path` is the local directory it returns.
path = kagglehub.dataset_download("grandmaster07/youtube-posting-dataset")
print("Dataset baixado em:", path)

# os.listdir() returns entries in arbitrary order, so sort the CSV names to make
# the choice of file deterministic between runs.
arquivos = sorted(f for f in os.listdir(path) if f.endswith('.csv'))
if not arquivos:
    # Fail with an explicit message instead of an opaque IndexError on arquivos[0].
    raise FileNotFoundError(f"Nenhum arquivo .csv encontrado em: {path}")
csv_path = os.path.join(path, arquivos[0])

print("Carregando o arquivo:", csv_path)
df = pd.read_csv(csv_path, low_memory=False)

print("Dataset carregado com sucesso!")
print(f"→ {df.shape[0]:,} linhas × {df.shape[1]} colunas")
# Last expression of the cell: rich preview of the first three rows.
df.head(3)

Downloading from https://www.kaggle.com/api/v1/datasets/download/grandmaster07/youtube-posting-dataset?dataset_version_number=1...


100%|██████████| 76.8k/76.8k [00:00<00:00, 51.3MB/s]

Extracting files...
Dataset baixado em: /root/.cache/kagglehub/datasets/grandmaster07/youtube-posting-dataset/versions/1
Carregando o arquivo: /root/.cache/kagglehub/datasets/grandmaster07/youtube-posting-dataset/versions/1/youtube-top-100-songs-2025.csv
Dataset carregado com sucesso!
→ 100 linhas × 13 colunas





Unnamed: 0,title,fulltitle,description,view_count,categories,tags,duration,duration_string,live_status,thumbnail,channel,channel_url,channel_follower_count
0,ROSÉ & Bruno Mars - APT. (Official Music Video),ROSÉ & Bruno Mars - APT. (Official Music Video),ROSÉ & Bruno Mars - APT.\nDownload/stream: ht...,2009014557,Music,YG Entertainment;YG;와이지;K-pop;BLACKPINK;블랙핑크;블...,173,2:53,False,https://i.ytimg.com/vi_webp/ekr2nIex040/maxres...,ROSÉ,https://www.youtube.com/channel/UCBo1hnzxV9rz3...,19200000
1,"Lady Gaga, Bruno Mars - Die With A Smile (Offi...","Lady Gaga, Bruno Mars - Die With A Smile (Offi...",MAYHEM OUT NOW\nhttp://ladygaga.com \n \nListe...,1324833300,Music,Lady Gaga;Bruno Mars;Interscope;Pop,252,4:12,False,https://i.ytimg.com/vi/kPa7bsKwL-c/maxresdefau...,Lady Gaga,https://www.youtube.com/channel/UC07Kxew-cMIay...,29600000
2,Reneé Rapp - Leave Me Alone (Official Music Vi...,Reneé Rapp - Leave Me Alone (Official Music Vi...,"Listen to “BITE ME”, the new album from Reneé ...",2536628,Music,Reneé Rapp;Interscope Records;Pop,160,2:40,False,https://i.ytimg.com/vi/tiPWzFLiz4A/maxresdefau...,Reneé Rapp,https://www.youtube.com/channel/UCZy4ki_L4bzw9...,408000


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Sanity check: show which columns `df` currently holds and how many rows it has,
# then preview the first three rows as the cell output.
nomes_colunas = df.columns.tolist()
total_linhas = len(df)

print("Colunas reais do seu dataset:")
print(nomes_colunas)
print(f"\nTotal de linhas: {total_linhas}")
df.head(3)

Colunas reais do seu dataset:
['view_count', 'categories', 'duration', 'duration_string', 'live_status', 'channel_follower_count', 'log_views', 'follower_ratio', 'views_per_second', 'category_id', 'is_live']

Total de linhas: 100


Unnamed: 0,view_count,categories,duration,duration_string,live_status,channel_follower_count,log_views,follower_ratio,views_per_second,category_id,is_live
0,2009014557,Music,173,2:53,False,19200000,21.42091,104.636169,11546060.0,0,0
1,1324833300,Music,252,4:12,False,29600000,21.004552,44.75788,5236495.0,0,0
2,2536628,Music,160,2:40,False,408000,14.746347,6.21721,15755.45,0,0


In [None]:
# Basic cleanup: drop columns that are not used as features.
# errors='ignore' keeps the cell re-runnable once the columns are already gone.
df = df.drop(columns=['fulltitle', 'thumbnail', 'channel', 'channel_url'], errors='ignore')

# Force numeric dtypes; values that cannot be parsed become NaN.
for coluna in ('view_count', 'duration', 'channel_follower_count'):
    df[coluna] = pd.to_numeric(df[coluna], errors='coerce')

# Drop rows missing any critical value BEFORE encoding the target. Encoding first
# would turn a missing category into a literal 'nan' class (via astype(str)), and
# the encoded column never contains NaN, so dropping on it afterwards removes nothing.
# .copy() makes the result an independent frame so the column assignments below
# do not trigger SettingWithCopyWarning.
df = df.dropna(subset=['view_count', 'duration', 'channel_follower_count', 'categories']).copy()

# Derived features. The +1 in the denominators avoids division by zero.
df['log_views'] = np.log1p(df['view_count'])
df['views_per_second'] = df['view_count'] / (df['duration'] + 1)
df['views_per_follower'] = df['view_count'] / (df['channel_follower_count'] + 1)

# `live_status` is a boolean in the loaded CSV (the preview shows False), so a
# comparison against the string 'live' alone would always yield 0. Accept the
# boolean spelling as well as the textual markers 'live' / 'is_live'.
status_ao_vivo = df['live_status'].astype(str).str.strip().str.lower()
df['is_live'] = status_ao_vivo.isin(['true', 'live', 'is_live']).astype(int)

# Target: categories -> integer category_id (labels are contiguous 0..n_classes-1
# because the encoder is fitted only on the rows that are kept).
le = LabelEncoder()
df['category_id'] = le.fit_transform(df['categories'].astype(str))

print(f"\nPré-processamento concluído!")
print(f"Dataset final: {df.shape[0]:,} linhas")
print(f"Categorias encontradas ({len(le.classes_)}): {list(le.classes_)[:10]}...")
print("Features criadas: log_views, views_per_second, views_per_follower, is_live")


Pré-processamento concluído!
Dataset final: 100 linhas
Categorias encontradas (2): ['Music', 'People & Blogs']...
Features criadas: log_views, views_per_second, views_per_follower, is_live


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Feature columns fed to every model below.
features = ['view_count', 'duration', 'channel_follower_count',
            'log_views', 'views_per_second', 'views_per_follower', 'is_live']

# Remaining NaNs in the feature matrix are replaced by 0; the target is the
# integer-encoded category.
X = df[features].fillna(0)
y = df['category_id']

# Stratified 80/20 split so both partitions keep the class proportions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Model 1: Random Forest
rf = RandomForestClassifier(n_estimators=300, max_depth=20, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
acc_rf = accuracy_score(y_test, rf.predict(X_test))

# Model 2: XGBoost
xgb = XGBClassifier(n_estimators=300, max_depth=8, learning_rate=0.1,
                    random_state=42, n_jobs=-1, eval_metric='mlogloss')
xgb.fit(X_train, y_train)
acc_xgb = accuracy_score(y_test, xgb.predict(X_test))

# Anomaly detection over ALL videos. Fitting on a random 40% subsample would make
# the reported count cover only that subsample, not the dataset.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
iso = IsolationForest(contamination=0.02, random_state=42, n_jobs=-1)
iso_pred = iso.fit_predict(X_scaled)
# fit_predict labels outliers as -1 and inliers as 1.
anomalias = int((iso_pred == -1).sum())

# Three-way verdict: equal accuracies are a tie, not a loss for XGBoost.
if acc_xgb > acc_rf:
    veredito = 'MELHOR'
elif acc_xgb < acc_rf:
    veredito = 'Perdeu'
else:
    veredito = 'Empate'

print(f"\nRESULTADOS FINAIS")
print(f"Random Forest → Acurácia: {acc_rf:.4f}")
print(f"XGBoost       → Acurácia: {acc_xgb:.4f} ← {veredito}")
print(f"Isolation Forest → {anomalias} vídeos anômalos detectados (ex: views explosivas em canal pequeno)")


RESULTADOS FINAIS
Random Forest → Acurácia: 0.9500
XGBoost       → Acurácia: 0.9500 ← Perdeu
Isolation Forest → 1 vídeos anômalos detectados (ex: views explosivas em canal pequeno)
