In [8]:
!pip install nltk vaderSentiment

Defaulting to user installation because normal site-packages is not writeable


In [10]:
# In a real Jupyter Notebook, each commented section would be a separate cell.

# --- Cell 1: Install Dependencies & Import Libraries ---
!pip install pandas nltk vaderSentiment

import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Download stopwords if you haven't already
try:
    stopwords.words('english')
except LookupError:
    print("Downloading NLTK stopwords...")
    nltk.download('stopwords')
    print("Download complete.")

print("Libraries imported successfully.")


# --- Cell 2: Load the Dataset (with Header Fix) ---
file_path = '../data/raw/Twitter_Data.csv' 

# --- THE FIX IS HERE ---
# We tell pandas there is no header and we provide our own column names.
# The tweet text is in the 4th column, which we will name 'text'.
try:
    df = pd.read_csv(file_path, header=None, names=['id', 'topic', 'sentiment_label_orig', 'text'])
    print("Dataset loaded successfully with custom headers.")
    
    # This is the column we will process.
    TEXT_COLUMN_NAME = 'text' 
    
    # Drop rows where the main text column is empty
    df.dropna(subset=[TEXT_COLUMN_NAME], inplace=True)
    print(f"\nShape of the dataset: {df.shape}")
    print("\n--- Sample of Loaded Data ---")
    display(df.head())

except FileNotFoundError:
    print(f"Error: The file was not found at {file_path}")
    df = None
except Exception as e:
    print(f"An error occurred: {e}")
    df = None


# --- Cell 3: Define Text Preprocessing Functions ---
if df is not None:
    stop_words = set(stopwords.words('english'))

    def preprocess_text(text):
        if not isinstance(text, str): return ""
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        text = re.sub(r'\@\w+|\#','', text)
        text = re.sub(r'[^\w\s]', '', text)
        text = text.lower()
        text_tokens = text.split()
        filtered_words = [word for word in text_tokens if word not in stop_words]
        return " ".join(filtered_words)

    print("\nPreprocessing function defined.")


# --- Cell 4: Apply Preprocessing to the DataFrame ---
if df is not None:
    print(f"\nApplying preprocessing to '{TEXT_COLUMN_NAME}' column...")
    df['processed_text'] = df[TEXT_COLUMN_NAME].apply(preprocess_text)
    print("Preprocessing complete.")
    
    print("\n--- Text Comparison ---")
    display(df[[TEXT_COLUMN_NAME, 'processed_text']].head())


# --- Cell 5: Perform Sentiment Analysis with VADER ---
if df is not None:
    analyzer = SentimentIntensityAnalyzer()
    df['vader_score'] = df['processed_text'].apply(lambda text: analyzer.polarity_scores(text)['compound'])
    print("\nVADER analysis complete.")


# --- Cell 6: Categorize VADER Scores ---
if df is not None:
    def categorize_sentiment(score):
        if score > 0.05: return 'Positive'
        elif score < -0.05: return 'Negative'
        else: return 'Neutral'

    df['vader_label'] = df['vader_score'].apply(categorize_sentiment)
    print("\n--- VADER Label Distribution ---")
    print(df['vader_label'].value_counts())


# --- Cell 7: Save the Processed Data ---
if df is not None:
    # Create a dummy date range
    num_records = len(df)
    start_date = "2024-01-01"
    dummy_dates = pd.to_datetime(pd.date_range(start=start_date, periods=num_records, freq='H'))
    df['date'] = dummy_dates
    
    # Select and rename columns for the final output file.
    final_df = df[['date', 'text', 'processed_text', 'vader_score', 'vader_label']].copy()
    
    output_path = '../data/processed/cleaned_sentiment_data.csv'
    
    try:
        final_df.to_csv(output_path, index=False)
        print(f"\nProcessed data successfully saved to: {output_path}")
        print("\n--- Final Processed Data Sample ---")
        display(final_df.head())
    except Exception as e:
        print(f"Error saving file: {e}")



Defaulting to user installation because normal site-packages is not writeable
Libraries imported successfully.
Dataset loaded successfully with custom headers.

Shape of the dataset: (73996, 4)

--- Sample of Loaded Data ---


Unnamed: 0,id,topic,sentiment_label_orig,text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...



Preprocessing function defined.

Applying preprocessing to 'text' column...
Preprocessing complete.

--- Text Comparison ---


Unnamed: 0,text,processed_text
0,im getting on borderlands and i will murder yo...,im getting borderlands murder
1,I am coming to the borders and I will kill you...,coming borders kill
2,im getting on borderlands and i will kill you ...,im getting borderlands kill
3,im coming on borderlands and i will murder you...,im coming borderlands murder
4,im getting on borderlands 2 and i will murder ...,im getting borderlands 2 murder



VADER analysis complete.

--- VADER Label Distribution ---
vader_label
Positive    33984
Negative    26616
Neutral     13396
Name: count, dtype: int64


  dummy_dates = pd.to_datetime(pd.date_range(start=start_date, periods=num_records, freq='H'))



Processed data successfully saved to: ../data/processed/cleaned_sentiment_data.csv

--- Final Processed Data Sample ---


Unnamed: 0,date,text,processed_text,vader_score,vader_label
0,2024-01-01 00:00:00,im getting on borderlands and i will murder yo...,im getting borderlands murder,-0.6908,Negative
1,2024-01-01 01:00:00,I am coming to the borders and I will kill you...,coming borders kill,-0.6908,Negative
2,2024-01-01 02:00:00,im getting on borderlands and i will kill you ...,im getting borderlands kill,-0.6908,Negative
3,2024-01-01 03:00:00,im coming on borderlands and i will murder you...,im coming borderlands murder,-0.6908,Negative
4,2024-01-01 04:00:00,im getting on borderlands 2 and i will murder ...,im getting borderlands 2 murder,-0.6908,Negative
