In [25]:
import os
import pandas as pd
import time
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from datetime import timedelta, datetime

In [26]:
# Configuration: folder holding the preprocessed comment CSVs and the folder
# that receives the VADER-scored copies (created if it does not exist yet).
input_folder = "comments/preprocessed_comments"
output_folder = "comments/preprocessed_with_vader"  # New folder for VADER results
os.makedirs(output_folder, exist_ok=True)

# Initialize VADER analyzer once; the same instance is reused for every file.
analyzer = SentimentIntensityAnalyzer()

# Get all CSV files. os.listdir() returns entries in arbitrary order, so the
# list is sorted to keep the processing order (and therefore the progress
# checkpoints) stable between runs and machines.
all_files = sorted(f for f in os.listdir(input_folder) if f.endswith('.csv'))
total_files = len(all_files)

print(f"Found {total_files} CSV files to process")
print(f"Output folder: {output_folder}")


Found 10286 CSV files to process
Output folder: comments/preprocessed_with_vader


In [27]:
# Score every file in `all_files` with VADER and write the result to
# `output_folder` as "<stem>_vader.csv". Files whose output already exists are
# skipped, so the cell can be re-run to resume an interrupted pass.
start_time = time.time()
no_comment_column = []  # files lacking 'preprocessed_comment'; failed files are appended too (kept for compatibility)
failed_files = {}       # filename -> error message, for files that raised while being processed
processed_count = 0
skipped_count = 0

# Fixed column order for the score columns. Without it the order follows the
# first row's dict, which differs depending on whether that comment is empty.
# NOTE(review): order chosen to match what analyzer.polarity_scores() is
# believed to return (neg, neu, pos, compound) - confirm against the library.
vader_keys = ['neg', 'neu', 'pos', 'compound']
# Scores assigned to missing / blank comments.
empty_scores = dict.fromkeys(vader_keys, 0.0)

for idx, filename in enumerate(all_files, 1):
    # Strip only the trailing extension; str.replace would also remove any
    # '.csv' occurring inside the name and make distinct inputs collide.
    base_name = os.path.splitext(filename)[0]
    output_path = os.path.join(output_folder, f"{base_name}_vader.csv")

    if os.path.exists(output_path):
        skipped_count += 1
    else:
        file_path = os.path.join(input_folder, filename)
        try:
            df = pd.read_csv(file_path)
            df.columns = df.columns.str.strip()

            if 'preprocessed_comment' not in df.columns:
                print(f"Warning: No 'preprocessed_comment' column found in {filename}. Available columns: {list(df.columns)}")
                no_comment_column.append(filename)
            else:
                score_rows = [
                    analyzer.polarity_scores(str(comment))
                    if pd.notna(comment) and str(comment).strip() != ""
                    else empty_scores
                    for comment in df['preprocessed_comment']
                ]
                # Explicit columns keep the schema identical for every file,
                # including files with zero rows; the explicit index keeps the
                # scores aligned with the rows they were computed from.
                vader_scores_df = pd.DataFrame(score_rows, columns=vader_keys, index=df.index)

                for key in vader_keys:
                    df['vader_' + key] = vader_scores_df[key]

                # Write to a temporary name and rename afterwards, so an
                # interrupted write never leaves a truncated file that the
                # exists-check above would treat as already processed.
                tmp_path = output_path + ".tmp"
                df.to_csv(tmp_path, index=False)
                os.replace(tmp_path, output_path)

                processed_count += 1
        except Exception as e:
            # Best effort: report the failure and continue with the next file.
            print(f"Error processing {filename}: {e}")
            no_comment_column.append(filename)
            failed_files[filename] = str(e)

    # Progress report at every 1000th file and at the end, regardless of
    # whether this particular file was processed, skipped or failed.
    if (idx % 1000 == 0 or idx == total_files) and processed_count > 0:
        elapsed = time.time() - start_time
        avg_per_file = elapsed / processed_count
        remaining_files = total_files - idx
        est_remaining = avg_per_file * remaining_files
        finish_time = datetime.now() + timedelta(seconds=est_remaining)
        print(f"files processed: {processed_count}/{len(all_files)}, expected finishing time: {finish_time}")


files processed: 1000/10286, expected finishing time: 2025-08-14 14:18:15.974055
files processed: 2000/10286, expected finishing time: 2025-08-14 14:18:19.516964
files processed: 3000/10286, expected finishing time: 2025-08-14 14:18:21.548763
files processed: 4000/10286, expected finishing time: 2025-08-14 14:18:21.562048
files processed: 5000/10286, expected finishing time: 2025-08-14 14:18:21.116259
files processed: 6000/10286, expected finishing time: 2025-08-14 14:18:27.510720
files processed: 7000/10286, expected finishing time: 2025-08-14 14:18:28.186750
files processed: 8000/10286, expected finishing time: 2025-08-14 14:18:28.268126
files processed: 9000/10286, expected finishing time: 2025-08-14 14:18:29.136782
files processed: 10000/10286, expected finishing time: 2025-08-14 14:18:29.519997
files processed: 10286/10286, expected finishing time: 2025-08-14 14:18:29.539942
