# 02_clean_and_score.ipynb
Clean the 6 VTT transcripts + add hate-speech / violence / sentiment scores
(For Garon’s PhD – Carleton RA project)

In [1]:
import pandas as pd
import re
import glob
from pathlib import Path
from transformers import pipeline
import matplotlib.pyplot as plt

RAW_DIR = Path("data/raw")
master_df = pd.read_csv("master_dataset.csv")
print(f"Loaded {len(master_df)} videos from master_dataset.csv")

Loaded 200 videos from master_dataset.csv


In [2]:
def clean_vtt(vtt_path):
    text = vtt_path.read_text(encoding='utf-8')
    # Remove header & timestamps
    text = re.sub(r'WEBVTT.*?\n\n', '', text, flags=re.DOTALL)
    text = re.sub(r'\d{2}:\d{2}:\d{2}\.\d{3} --> .*?\n', '', text)
    text = re.sub(r'<.*?>', '', text)  # remove tags
    text = re.sub(r'\n+', ' ', text).strip()
    return text

# Load and clean the 6 transcripts
transcripts = []
for vtt_file in RAW_DIR.glob("*.en.vtt"):
    video_id = vtt_file.stem
    clean_text = clean_vtt(vtt_file)
    if len(clean_text) > 50:
        transcripts.append({'video_id': video_id, 'clean_transcript': clean_text})

clean_df = pd.DataFrame(transcripts)
print(f"Cleaned {len(clean_df)} transcripts")
clean_df.head()

Cleaned 6 transcripts


Unnamed: 0,video_id,clean_transcript
0,56w54X-gLig.en,[Music] [Music] e e e [Music] ...
1,3XcN3DCDqFs.en,do as they say or they will shut the do as the...
2,9LHCC4TteGU.en,but given the new more reasonable tone but giv...
3,QThBwP2KV3k.en,this segment contains sensitive imagery this s...
4,dZxvVo1pd0o.en,The Honorable member from Calgary NOS The Hono...


In [None]:
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

labels_hate = ["hate speech", "non-hate"]
labels_violence = ["promotes violence", "peaceful"]
labels_sentiment = ["negative", "neutral", "positive"]

results = []
for _, row in clean_df.iterrows():
    text = row['clean_transcript'][:1000]  # truncate for speed
    
    hate = classifier(text, labels_hate)['scores'][0]  # hate score
    violence = classifier(text, labels_violence)['scores'][0]
    sentiment = classifier(text, labels_sentiment)['labels'][0]
    
    results.append({
        'hate_score': round(hate, 3),
        'violence_score': round(violence, 3),
        'sentiment': sentiment
    })
    print(f"{row['video_id']}: hate={hate:.2f}, violence={violence:.2f}, {sentiment}")

scores_df = pd.DataFrame(results)
final_df = clean_df.join(scores_df)
final_df = master_df[master_df['video_id'].isin(final_df['video_id'])].merge(final_df, on='video_id')
final_df.to_csv("scored_dataset.csv", index=False)
print("\nSaved final scored_dataset.csv")
final_df[['title', 'hate_score', 'violence_score', 'sentiment']]

In [None]:
plt.figure(figsize=(8,4))
final_df['sentiment'].value_counts().plot(kind='bar', color=['red','gray','green'])
plt.title("Sentiment of 6 Canadian Far-Right Policy Transcripts")
plt.ylabel("Count")
plt.xticks(rotation=0)
plt.savefig("sentiment_chart.png", dpi=150, bbox_inches='tight')
plt.show()