# Install Libraries

In [None]:
!pip install transformers torch torchvision
!pip install scikit-learn pandas numpy matplotlib seaborn
!pip install nltk wordcloud textblob

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
nltk.download('stopwords')
from transformers import pipeline

In [None]:
# Load the comment dataset from a hard-coded absolute path
# (presumably a Google Colab workspace path — the file must exist there).
df = pd.read_csv('/content/Youtube-Spam-Dataset.csv')
# Print the loaded frame as a quick sanity check.
print(df)

In [None]:
# Report the dataset dimensions as (rows, columns).
print(df.shape)

# Text Preprocessing

In [None]:
def clean_text(text):
    """Normalise a raw comment into lowercase alphanumeric words.

    Missing values (anything ``pd.isna`` reports as NA) become ``''``.
    Otherwise the value is stringified and lowercased, then links,
    @mentions and #hashtags are dropped, every remaining character that is
    not an ASCII letter, digit or whitespace is removed, and whitespace
    runs are collapsed to single spaces with both ends trimmed.
    """
    if pd.isna(text):
        return ''

    # Order matters: links and @/# tokens must be removed while their
    # punctuation is still present, before the generic character filter.
    substitutions = (
        (r'http\S+|www\S+|https\S+', ''),
        (r'@\w+|#\w+', ''),
        (r'[^a-zA-Z0-9\s]', ''),
        (r'\s+', ' '),
    )
    cleaned = str(text).lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

In [None]:
# Normalise every comment, then keep only rows that still have text after
# cleaning (clean_text returns '' for missing or symbol-only comments).
df['CLEAN_CONTENT'] = df['CONTENT'].apply(clean_text)
# .copy() makes the filtered frame independent of the unfiltered one, so
# adding columns to it later does not trigger SettingWithCopyWarning.
df = df[df['CLEAN_CONTENT'].str.len() > 0].copy()

In [None]:
# Preview the cleaned text column.
print(df['CLEAN_CONTENT'])

# Load Classifier

In [None]:
# Load the Hugging Face text-classification pipeline used for toxicity scoring.
# truncation=True is forwarded to the tokenizer so that comments longer than
# the model's maximum input length are cut down instead of raising an error.
classifier = pipeline(
    'text-classification',
    model='martin-ha/toxic-comment-model',
    truncation=True,
)

In [None]:
# Regex fragments treated as spam indicators; create_spam_labels counts how
# many of them match a comment (case-insensitively).
# NOTE(review): matching is done on CLEAN_CONTENT, from which clean_text has
# already removed links and all punctuation, so r'\.com' cannot match there.
# NOTE(review): any text matching r'my channel' also matches r'channel', so
# that phrase counts as two matches.
spam_patterns = [
    r'subscribe', r'channel', r'check out', r'follow me',
    r'my channel', r'visit', r'website', r'\.com', r'www'
]

In [None]:
def create_spam_labels(df, *, classify=None, patterns=None):
    """Assign a heuristic spam label (1 = spam, 0 = clean) to each comment.

    Each entry of ``df['CLEAN_CONTENT']`` is scored with two signals: the
    number of spam patterns that match it, and the classifier's score when
    the classifier labels it toxic. A comment is labelled spam when any of
    these holds:

    * two or more patterns match;
    * at least one pattern matches and toxicity is above 0.3;
    * toxicity is above 0.7.

    Args:
        df: DataFrame with a ``CLEAN_CONTENT`` column of strings.
        classify: Callable taking one string and returning a sequence whose
            first item is a mapping with ``'label'`` and ``'score'`` keys.
            Defaults to the module-level ``classifier``.
        patterns: Iterable of regex strings to count (matched
            case-insensitively). Defaults to the module-level
            ``spam_patterns``.

    Returns:
        A list of 0/1 ints, one per row, in row order.
    """
    import logging

    if classify is None:
        classify = classifier
    if patterns is None:
        patterns = spam_patterns

    # Compile once instead of re-evaluating every pattern string per comment.
    compiled = [re.compile(p, re.IGNORECASE) for p in patterns]

    labels = []
    failures = 0
    for content in df['CLEAN_CONTENT']:
        if not isinstance(content, str):
            # Nothing to match or classify; keep the best-effort "clean" label.
            labels.append(0)
            continue

        hits = sum(1 for rx in compiled if rx.search(content))
        if hits >= 2:
            # Already spam on patterns alone; skip the costly model call.
            labels.append(1)
            continue

        try:
            top = classify(content)[0]
            # Label casing differs between models, so compare
            # case-insensitively rather than against 'TOXIC' only.
            is_toxic = str(top['label']).lower() == 'toxic'
            toxicity = top['score'] if is_toxic else 0
        except Exception:
            # Best effort: a classifier failure only zeroes the toxicity
            # signal instead of aborting the whole labelling run.
            failures += 1
            toxicity = 0

        labels.append(int((hits >= 1 and toxicity > 0.3) or toxicity > 0.7))

    if failures:
        logging.getLogger(__name__).warning(
            "toxicity scoring failed for %d comment(s); treated as non-toxic",
            failures,
        )
    return labels

In [None]:
# Store the heuristic 0/1 labels returned by create_spam_labels as a new column.
df['CLASS_LABEL'] = create_spam_labels(df)

In [None]:
# Preview the heuristic labels stored in CLASS_LABEL.
print(df['CLASS_LABEL'])

# Basic Stats

In [None]:
# Headline numbers: total rows, rows per label, and spam share as a percentage.
total_count = len(df)
spam_count = df['CLASS_LABEL'].eq(1).sum()
clean_count = df['CLASS_LABEL'].eq(0).sum()
spam_rate = spam_count / total_count * 100

In [None]:
# One-line summary; counts use thousands separators, the rate one decimal place.
print(f"Total: {total_count:,} | Spam: {spam_count:,} ({spam_rate:.1f}%)")

# Create Charts

In [None]:
# Open a wide (15 x 5) figure for the charts.
plt.figure(figsize=(15, 5))

Pie Chart

In [None]:
# Select the first panel of a 1-row, 2-column subplot grid.
plt.subplot(1, 2, 1)
# Pie inputs; the sequences are index-aligned (clean first, spam second).
labels = ['Clean Comments', 'Spam Comments']
sizes = [clean_count, spam_count]
colors = ['#2ecc71', '#e74c3c']
# Offset only the second (spam) wedge from the centre.
explode = (0, 0.1)

In [None]:
# Draw the pie with one-decimal percentage labels, starting at 90 degrees.
plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%',
        startangle=90, explode=explode)
plt.title('Comment Distribution', fontweight='bold')