In [1]:
# Mount Drive
from google.colab import drive
drive.mount('/content/drive')

import os
os.chdir('/content/drive/MyDrive/TwitterSentimentProject')

print("‚úÖ Setup complete!")
print("Current directory:", os.getcwd())

Mounted at /content/drive
‚úÖ Setup complete!
Current directory: /content/drive/MyDrive/TwitterSentimentProject


In [2]:
# Import libraries
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
import seaborn as sns

# Download NLTK data
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

print("‚úÖ Libraries imported!")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


‚úÖ Libraries imported!


In [3]:
# Load cleaned data from Phase 1
df = pd.read_csv('data/processed/tweets_cleaned.csv')

print(f"Dataset loaded: {len(df):,} rows")
print(f"Columns: {list(df.columns)}")
print(f"\nSentiment distribution:")
print(df['sentiment'].value_counts())

# Show sample tweets
print("\nSample tweets (before cleaning):")
for i, text in enumerate(df['text'].head(3), 1):
    print(f"{i}. {text}\n")

Dataset loaded: 99,357 rows
Columns: ['sentiment', 'id', 'date', 'query', 'user', 'text', 'tweet_length', 'word_count', 'hashtags', 'mentions', 'hashtag_count', 'mention_count', 'has_url']

Sentiment distribution:
sentiment
1    49739
0    49618
Name: count, dtype: int64

Sample tweets (before cleaning):
1. @stargazer60 that's awesome 

2. @cunningstunts22  yes... i am!  

3. N my bed aaallll alone 



In [4]:
# Text cleaning functions

def remove_urls(text):
    """Remove URLs from text"""
    return re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

def remove_mentions(text):
    """Remove @mentions"""
    return re.sub(r'@\w+', '', text)

def remove_hashtags(text):
    """Remove # symbol but keep the word"""
    return re.sub(r'#', '', text)

def remove_numbers(text):
    """Remove numbers"""
    return re.sub(r'\d+', '', text)

def remove_punctuation(text):
    """Remove punctuation"""
    return text.translate(str.maketrans('', '', string.punctuation))

def remove_extra_spaces(text):
    """Remove extra whitespaces"""
    return ' '.join(text.split())

def to_lowercase(text):
    """Convert to lowercase"""
    return text.lower()

print("‚úÖ Cleaning functions created!")

# Test functions
sample_text = "@john Hey! Check this out https://example.com #awesome 123"
print("\nOriginal:", sample_text)
print("After URL removal:", remove_urls(sample_text))
print("After mention removal:", remove_mentions(sample_text))
print("After hashtag removal:", remove_hashtags(sample_text))
print("After number removal:", remove_numbers(sample_text))
print("After punctuation removal:", remove_punctuation(sample_text))

‚úÖ Cleaning functions created!

Original: @john Hey! Check this out https://example.com #awesome 123
After URL removal: @john Hey! Check this out  #awesome 123
After mention removal:  Hey! Check this out https://example.com #awesome 123
After hashtag removal: @john Hey! Check this out https://example.com awesome 123
After number removal: @john Hey! Check this out https://example.com #awesome 
After punctuation removal: john Hey Check this out httpsexamplecom awesome 123


In [5]:
# Apply full cleaning pipeline
def clean_text(text):
    """Complete text cleaning pipeline"""
    text = str(text)  # Ensure string
    text = to_lowercase(text)
    text = remove_urls(text)
    text = remove_mentions(text)
    text = remove_hashtags(text)
    text = remove_numbers(text)
    text = remove_punctuation(text)
    text = remove_extra_spaces(text)
    return text

print("Cleaning all tweets...")
print("This may take 1-2 minutes...\n")

# Apply cleaning
df['text_cleaned'] = df['text'].apply(clean_text)

print("‚úÖ Text cleaning complete!")

# Compare before and after
print("\n" + "="*70)
print("BEFORE vs AFTER CLEANING")
print("="*70)
for i in range(5):
    print(f"\nTweet {i+1}:")
    print(f"Before: {df['text'].iloc[i]}")
    print(f"After:  {df['text_cleaned'].iloc[i]}")

Cleaning all tweets...
This may take 1-2 minutes...

‚úÖ Text cleaning complete!

BEFORE vs AFTER CLEANING

Tweet 1:
Before: @stargazer60 that's awesome 
After:  thats awesome

Tweet 2:
Before: @cunningstunts22  yes... i am!  
After:  yes i am

Tweet 3:
Before: N my bed aaallll alone 
After:  n my bed aaallll alone

Tweet 4:
Before: @Thorney88 re guinea fowl ive heard that lots of people dislike it!  - THEY SAY ITS FOWL!!!  lol 
After:  re guinea fowl ive heard that lots of people dislike it they say its fowl lol

Tweet 5:
Before: def going to the movies tonite 
After:  def going to the movies tonite


In [6]:
# Apply full cleaning pipeline
def clean_text(text):
    """Complete text cleaning pipeline"""
    text = str(text)  # Ensure string
    text = to_lowercase(text)
    text = remove_urls(text)
    text = remove_mentions(text)
    text = remove_hashtags(text)
    text = remove_numbers(text)
    text = remove_punctuation(text)
    text = remove_extra_spaces(text)
    return text

print("Cleaning all tweets...")
print("This may take 1-2 minutes...\n")

# Apply cleaning
df['text_cleaned'] = df['text'].apply(clean_text)

print("‚úÖ Text cleaning complete!")

# Compare before and after
print("\n" + "="*70)
print("BEFORE vs AFTER CLEANING")
print("="*70)
for i in range(5):
    print(f"\nTweet {i+1}:")
    print(f"Before: {df['text'].iloc[i]}")
    print(f"After:  {df['text_cleaned'].iloc[i]}")

Cleaning all tweets...
This may take 1-2 minutes...

‚úÖ Text cleaning complete!

BEFORE vs AFTER CLEANING

Tweet 1:
Before: @stargazer60 that's awesome 
After:  thats awesome

Tweet 2:
Before: @cunningstunts22  yes... i am!  
After:  yes i am

Tweet 3:
Before: N my bed aaallll alone 
After:  n my bed aaallll alone

Tweet 4:
Before: @Thorney88 re guinea fowl ive heard that lots of people dislike it!  - THEY SAY ITS FOWL!!!  lol 
After:  re guinea fowl ive heard that lots of people dislike it they say its fowl lol

Tweet 5:
Before: def going to the movies tonite 
After:  def going to the movies tonite


In [7]:
# Get English stopwords
stop_words = set(stopwords.words('english'))

# Option to keep some important words for sentiment
# Remove negation words from stopwords (they're important for sentiment!)
negation_words = {'not', 'no', 'nor', 'neither', 'never', 'none', 'nobody', 'nothing', 'nowhere', 'hardly', 'barely', 'scarcely'}
stop_words = stop_words - negation_words

print(f"Total stopwords: {len(stop_words)}")
print(f"Sample stopwords: {list(stop_words)[:20]}")
print(f"\nKept negation words for sentiment: {negation_words}")

Total stopwords: 195
Sample stopwords: ['re', 'couldn', 'have', 'we', 'as', "he's", 'his', 'for', 'she', "you've", 'theirs', 'you', "she'd", 'most', 'under', "isn't", 'this', 'where', 'being', "you'll"]

Kept negation words for sentiment: {'not', 'none', 'hardly', 'nor', 'barely', 'nobody', 'neither', 'no', 'nothing', 'never', 'scarcely', 'nowhere'}


In [8]:
def remove_stopwords(text):
    """Remove stopwords while keeping negations"""
    words = text.split()
    filtered_words = [word for word in words if word not in stop_words]
    return ' '.join(filtered_words)

print("Removing stopwords...")

df['text_no_stopwords'] = df['text_cleaned'].apply(remove_stopwords)

print("‚úÖ Stopwords removed!")

# Compare
print("\n" + "="*70)
print("WITH vs WITHOUT STOPWORDS")
print("="*70)
for i in range(3):
    print(f"\nTweet {i+1}:")
    print(f"With stopwords: {df['text_cleaned'].iloc[i]}")
    print(f"Without:        {df['text_no_stopwords'].iloc[i]}")

Removing stopwords...
‚úÖ Stopwords removed!

WITH vs WITHOUT STOPWORDS

Tweet 1:
With stopwords: thats awesome
Without:        thats awesome

Tweet 2:
With stopwords: yes i am
Without:        yes

Tweet 3:
With stopwords: n my bed aaallll alone
Without:        n bed aaallll alone


In [9]:
# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Lemmatize words to their root form"""
    words = text.split()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmatized_words)

print("Lemmatizing text...")
print("This may take 2-3 minutes...\n")

df['text_lemmatized'] = df['text_no_stopwords'].apply(lemmatize_text)

print("‚úÖ Lemmatization complete!")

# Compare
print("\n" + "="*70)
print("BEFORE vs AFTER LEMMATIZATION")
print("="*70)
for i in range(3):
    print(f"\nTweet {i+1}:")
    print(f"Before: {df['text_no_stopwords'].iloc[i]}")
    print(f"After:  {df['text_lemmatized'].iloc[i]}")

Lemmatizing text...
This may take 2-3 minutes...

‚úÖ Lemmatization complete!

BEFORE vs AFTER LEMMATIZATION

Tweet 1:
Before: thats awesome
After:  thats awesome

Tweet 2:
Before: yes
After:  yes

Tweet 3:
Before: n bed aaallll alone
After:  n bed aaallll alone


In [10]:
# Calculate statistics
df['cleaned_length'] = df['text_cleaned'].apply(len)
df['cleaned_word_count'] = df['text_cleaned'].apply(lambda x: len(x.split()))
df['lemmatized_word_count'] = df['text_lemmatized'].apply(lambda x: len(x.split()))

print("="*70)
print("TEXT PROCESSING STATISTICS")
print("="*70)

stats = pd.DataFrame({
    'Stage': ['Original', 'Cleaned', 'After Stopwords', 'Lemmatized'],
    'Avg Length (chars)': [
        df['text'].apply(len).mean(),
        df['text_cleaned'].apply(len).mean(),
        df['text_no_stopwords'].apply(len).mean(),
        df['text_lemmatized'].apply(len).mean()
    ],
    'Avg Word Count': [
        df['text'].apply(lambda x: len(str(x).split())).mean(),
        df['cleaned_word_count'].mean(),
        df['text_no_stopwords'].apply(lambda x: len(x.split())).mean(),
        df['lemmatized_word_count'].mean()
    ]
})

print(stats.to_string(index=False))

TEXT PROCESSING STATISTICS
          Stage  Avg Length (chars)  Avg Word Count
       Original           74.422547       13.236491
        Cleaned           62.195708       12.441046
After Stopwords           43.601759        7.305998
     Lemmatized           43.061455        7.305998


In [None]:
# Visualize cleaning impact
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# 1. Word count comparison
stages = ['Original', 'Cleaned', 'No Stopwords', 'Lemmatized']
word_counts = [
    df['text'].apply(lambda x: len(str(x).split())).mean(),
    df['cleaned_word_count'].mean(),
    df['text_no_stopwords'].apply(lambda x: len(x.split())).mean(),
    df['lemmatized_word_count'].mean()
]

axes[0, 0].bar(stages, word_counts, color=['#3498db', '#2ecc71', '#f39c12', '#e74c3c'], alpha=0.8)
axes[0, 0].set_ylabel('Average Word Count', fontsize=12)
axes[0, 0].set_title('Impact of Preprocessing on Word Count', fontsize=14, fontweight='bold')
axes[0, 0].grid(axis='y', alpha=0.3)
for i, v in enumerate(word_counts):
    axes[0, 0].text(i, v + 0.5, f'{v:.1f}', ha='center', fontweight='bold')

# 2. Distribution of word counts (cleaned)
axes[0, 1].hist(df[df['sentiment']==0]['cleaned_word_count'],
                bins=30, alpha=0.6, label='Negative', color='#e74c3c')
axes[0, 1].hist(df[df['sentiment']==1]['cleaned_word_count'],
                bins=30, alpha=0.6, label='Positive', color='#2ecc71')
axes[0, 1].set_xlabel('Word Count', fontsize=12)
axes[0, 1].set_ylabel('Frequency', fontsize=12)
axes[0, 1].set_title('Word Count Distribution After Cleaning', fontsize=14, fontweight='bold')
axes[0, 1].legend()
axes[0, 1].grid(alpha=0.3)

# 3. Text length comparison
axes[1, 0].boxplot([
    df['text'].apply(len),
    df['text_cleaned'].apply(len),
    df['text_no_stopwords'].apply(len),
    df['text_lemmatized'].apply(len)
], labels=stages)
axes[1, 0].set_ylabel('Character Length', fontsize=12)
axes[1, 0].set_title('Text Length at Different Stages', fontsize=14, fontweight='bold')
axes[1, 0].grid(axis='y', alpha=0.3)

# 4. Sample comparison table
axes[1, 1].axis('off')
comparison_text = """
PREPROCESSING PIPELINE SUMMARY

Stages Applied:
1. Lowercase conversion
2. URL removal
3. @mention removal
4. Hashtag symbol removal
5. Number removal
6. Punctuation removal
7. Extra space removal
8. Stopword removal (kept negations)
9. Lemmatization

Results:
- Average word reduction: {:.1f}%
- Cleaned tweets: {:,}
- Ready for feature extraction

Next Steps (Person 2):
‚Üí TF-IDF vectorization
‚Üí Feature engineering
""".format(
    (1 - df['lemmatized_word_count'].mean() / df['text'].apply(lambda x: len(str(x).split())).mean()) * 100,
    len(df)
)

axes[1, 1].text(0.1, 0.5, comparison_text,
                fontsize=11, family='monospace',
                verticalalignment='center',
                bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.3))

plt.tight_layout()
plt.savefig('results/figures/07_person1_text_cleaning.png', dpi=300, bbox_inches='tight')
plt.show()

print("‚úÖ Saved: results/figures/07_person1_text_cleaning.png")

In [11]:
# Save preprocessed data for Person 2 and Person 3
output_file = 'data/processed/tweets_preprocessed_person1.csv'

# Select relevant columns
df_output = df[['sentiment', 'text', 'text_cleaned', 'text_no_stopwords',
                'text_lemmatized', 'cleaned_word_count', 'lemmatized_word_count']]

df_output.to_csv(output_file, index=False)

file_size = os.path.getsize(output_file) / (1024**2)

print(f"‚úÖ Preprocessed data saved!")
print(f"üìÅ Location: {output_file}")
print(f"üíæ Size: {file_size:.2f} MB")
print(f"üìä Rows: {len(df_output):,}")
print(f"üìã Columns: {list(df_output.columns)}")

print("\n" + "="*70)
print("‚úÖ PERSON 1 TASK COMPLETE!")
print("="*70)
print("Deliverables:")
print("  ‚Ä¢ Cleaned and preprocessed text")
print("  ‚Ä¢ Removed URLs, mentions, numbers, punctuation")
print("  ‚Ä¢ Removed stopwords (kept negations)")
print("  ‚Ä¢ Lemmatized text")
print("  ‚Ä¢ Saved processed data for team")
print("  ‚Ä¢ Created visualization")

‚úÖ Preprocessed data saved!
üìÅ Location: data/processed/tweets_preprocessed_person1.csv
üíæ Size: 22.27 MB
üìä Rows: 99,357
üìã Columns: ['sentiment', 'text', 'text_cleaned', 'text_no_stopwords', 'text_lemmatized', 'cleaned_word_count', 'lemmatized_word_count']

‚úÖ PERSON 1 TASK COMPLETE!
Deliverables:
  ‚Ä¢ Cleaned and preprocessed text
  ‚Ä¢ Removed URLs, mentions, numbers, punctuation
  ‚Ä¢ Removed stopwords (kept negations)
  ‚Ä¢ Lemmatized text
  ‚Ä¢ Saved processed data for team
  ‚Ä¢ Created visualization


In [15]:
!git config --global user.name "zeynkash"  # Replace with your name
!git config --global user.email "030721077@std.izu.edu.tr"

In [16]:
%cd /content

/content


In [18]:
!git clone https://github.com/zeynkash/twitter-sentiment-analysis.git

Cloning into 'twitter-sentiment-analysis'...
remote: Enumerating objects: 19, done.[K
remote: Counting objects: 100% (19/19), done.[K
remote: Compressing objects: 100% (17/17), done.[K
remote: Total 19 (delta 0), reused 16 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (19/19), 3.61 MiB | 16.26 MiB/s, done.


In [19]:
%cd /content/twitter-sentiment-analysis


/content/twitter-sentiment-analysis


In [None]:
!cp /content/drive/MyDrive/TwitterSentimentProject/notebooks/02_Phase2_Person1_TextCleaning.ipynb notebooks/
