# Youtube Scaled Abuse Analysis Project

In [4]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import requests
from google.cloud import videointelligence
import os

# Data collection and preprocessing
Data Collection: Automates the process of gathering video data from YouTube, which can be used for analysis or machine learning projects.
<br>
Simulated Data: Adds extra columns to simulate real-world scenarios where videos might violate policies or receive user reports, useful for testing algorithms or models.

<ol> Construct a URL to call the YouTube API with your API Key </ol>
<ol> send the request to get the video Data</ol>
<ol> parse the response to video details like id, title, description, and publishedAt</ol>
<ol> store the data in list of dictionaries</ol>
<ol> Convert the list of video data into a Pandas DataFrame for easy manipulation and analysis.</ol>
<ol> Violation Type: Randomly assigns a type of policy violation (e.g., 'spam', 'hate speech') to each video. </ol>
<ol> User Reports: Randomly generates a number of user reports for each video.</ol>

In [5]:
def collect_video_data(api_key, max_results=1000):
    url = f"https://www.googleapis.com/youtube/v3/search?key={api_key}&part=snippet&type=video&maxResults={max_results}"
    response = requests.get(url)
    data = response.json()
    videos = []
    for item in data.get('items', []):
        video = {
            'id': item['id']['videoId'],
            'title': item['snippet']['title'],
            'description': item['snippet']['description'],
            'publishedAt': item['snippet']['publishedAt']
        }
        videos.append(video)
    return pd.DataFrame(videos)

In [7]:
# Simulating data collection
video_data = collect_video_data('AIzaSyDhHFhatbX3mUMu98yDfmHs43z0cbvH1_o')

# Add simulated policy violation data
video_data['violation_type'] = np.random.choice(['none', 'spam', 'hate_speech', 'nudity', 'violence'], size=len(video_data))
video_data['user_reports'] = np.random.randint(0, 100, size=len(video_data))

In [8]:
video_data.head()

Unnamed: 0,id,title,description,publishedAt,violation_type,user_reports
0,hbivC9ztGOE,🎥 EXAM (2009) | Full Movie Trailer in HD | 1080p,Eight candidates for a highly desirable corpor...,2018-06-02T12:00:03Z,none,22
1,VCqwS05SU4s,Millet Noodles 🍜 #proteinrichrecipe #proteinno...,,2024-09-10T05:07:45Z,none,14
2,cn27hLC0X90,Trump-Tulsi Vs Kamala Harris LIVE | Trump-Harr...,Trump-Tulsi Vs Kamala Harris LIVE | Trump-Harr...,2024-09-04T20:48:54Z,hate_speech,60
3,TRxKwivxmls,Use expiring milk to make Korean rice mask 🥛,,2024-09-10T00:55:15Z,nudity,28
4,Fd56fk3bMAw,"Knock-knock, What a car is at the door? Best S...","Knock-knock, What a car is at the door? Best S...",2024-04-17T17:18:02Z,nudity,48


# Exploratory Data Analysis

In [10]:
import matplotlib
%matplotlib inline

In [16]:
def perform_eda(df):
    plt.figure(figsize=(12, 6))
    sns.countplot(x='violation_type', data=df) # to understand which type of violation type is most common
    plt.title('Distribution of Policy Violations')
    plt.savefig('violation_distribution.png')
    plt.close()

    df['violation_type_encoded'] = df['violation_type'].astype('category').cat.codes
    correlation = df[['user_reports', 'violation_type_encoded']].corr() #shows the correlation between user_reports and violation_types
    plt.figure(figsize=(10, 8))
    sns.heatmap(correlation, annot=True, cmap='coolwarm')
    plt.title('Correlation between User Reports and Violations')
    plt.savefig('correlation_heatmap.png')
    plt.close()

perform_eda(video_data)
# -0.011 meaning: changes in one variable have little to no effect on the other.

# Feature Engineering

In [17]:
def extract_text_features(df):
    tfidf = TfidfVectorizer(max_features=1000, stop_words='english')
    text_features = tfidf.fit_transform(df['description'])
    return pd.DataFrame(text_features.toarray(), columns=tfidf.get_feature_names_out())

def extract_visual_features(video_id):
    # Simulating visual feature extraction
    # In a real scenario, you would use the Video Intelligence API
    return np.random.rand(1, 100)  # 100 visual features

text_features = extract_text_features(video_data)
visual_features = pd.DataFrame(np.random.rand(len(video_data), 100), columns=[f'visual_feature_{i}' for i in range(100)])

# Combine all features
X = pd.concat([video_data[['user_reports']], text_features, visual_features], axis=1)
y = video_data['violation_type']

# Model Development

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline for preprocessing and model training
numeric_features = ['user_reports']
text_features = text_features.columns
visual_features = visual_features.columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('text', 'passthrough', text_features),
        ('visual', 'passthrough', visual_features)
    ])

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Model Evaluation and Optimization

In [19]:
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [10, 20, 30, None],
    'classifier__min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

print("Best parameters:", grid_search.best_params_)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] END classifier__max_depth=10, classifier__min_samples_split=2, classifier__n_estimators=100; total time=   0.1s
[CV] END classifier__max_depth=10, classifier__min_samples_split=2, classifier__n_estimators=100; total time=   0.1s
[CV] END classifier__max_depth=10, classifier__min_samples_split=2, classifier__n_estimators=100; total time=   0.1s
[CV] END classifier__max_depth=10, classifier__min_samples_split=2, classifier__n_estimators=100; total time=   0.1s
[CV] END classifier__max_depth=10, classifier__min_samples_split=2, classifier__n_estimators=100; total time=   0.1s
[CV] END classifier__max_depth=10, classifier__min_samples_split=2, classifier__n_estimators=200; total time=   0.2s
[CV] END classifier__max_depth=10, classifier__min_samples_split=2, classifier__n_estimators=200; total time=   0.2s
[CV] END classifier__max_depth=10, classifier__min_samples_split=2, classifier__n_estimators=200; total time=   0.2s
[C

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Scalability and Performance Optimization

In [21]:
import dask.dataframe as dd

dask_df = dd.from_pandas(X, npartitions=4)
dask_predictions = dask_df.map_partitions(best_model.predict).compute()

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



# Incident Response System

In [22]:
def detect_anomalies(df, threshold=2):
    mean = df['user_reports'].mean()
    std = df['user_reports'].std()
    df['is_anomaly'] = df['user_reports'] > (mean + threshold * std)
    return df

anomalies = detect_anomalies(video_data)
print(f"Number of potential incidents detected: {anomalies['is_anomaly'].sum()}")

Number of potential incidents detected: 0


# Reporting and Visualizations

In [23]:
def generate_report(df, predictions):
    plt.figure(figsize=(12, 6))
    sns.countplot(x=predictions)
    plt.title('Distribution of Predicted Policy Violations')
    plt.savefig('predicted_violations.png')
    plt.close()

    accuracy = (predictions == df['violation_type']).mean()
    print(f"Model Accuracy: {accuracy:.2f}")

    cm = confusion_matrix(df['violation_type'], predictions)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix of Policy Violations')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.savefig('confusion_matrix.png')
    plt.close()

generate_report(video_data, dask_predictions)

print("Project execution completed. Check the generated visualizations for insights.")

Model Accuracy: 0.84
Project execution completed. Check the generated visualizations for insights.
