<a href="https://colab.research.google.com/github/thatswhatmeetcoded/Sentiment-Classification/blob/main/decision_tree/2_text_preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# text_preprocessing.ipynb

import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Mount Google Drive
# The raw CSV is read from, and the cleaned CSV written to, paths under
# /content/drive, so the Drive mount has to happen before any file access.
from google.colab import drive
drive.mount('/content/drive')

# Load raw data from Drive
# NOTE(review): the code below reads the 'text' and 'sentiment' columns of this
# file — confirm the CSV schema matches.
raw_data_path = '/content/drive/MyDrive/raw_combined_data.csv'
df_all = pd.read_csv(raw_data_path)

# Download NLTK resources
# 'stopwords' backs stopwords.words('english'); 'wordnet' backs WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

# Preprocessing setup
# Module-level objects shared by clean_text(): the stopword list is turned into
# a set once here so membership tests per token are cheap.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Define text cleaning function
def clean_text(text):
    """Normalise raw text into a space-joined string of lemmatized tokens.

    Steps, in order: lowercase; remove URLs; remove @mentions and #hashtags
    (the whole token, not just the symbol); remove punctuation; split on
    whitespace; drop tokens found in the module-level ``stop_words`` set and
    lemmatize the rest with the module-level ``lemmatizer``.

    Args:
        text: Raw text to clean. Non-string values (for example the float NaN
            that pandas uses for a missing cell) are treated as empty text.

    Returns:
        The cleaned text, or "" when the input is not a string or no token
        survives cleaning.
    """
    # Missing cells arrive as non-str objects; calling .lower() on them would
    # raise AttributeError, so treat them as empty instead.
    if not isinstance(text, str):
        return ""

    text = text.lower()  # Lowercase
    # Remove URLs. The dot after "www" is escaped: unescaped it matches any
    # character, so "awww so cute" would lose "www so" as a bogus URL.
    text = re.sub(r"http\S+|www\.\S+", "", text)
    text = re.sub(r"@\w+|#\w+", "", text)  # Remove mentions and hashtags
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    words = text.split()  # Tokenize on whitespace
    # NOTE(review): the stopword test runs after punctuation removal, so
    # apostrophe forms such as "you'll" are compared as "youll" — confirm
    # whether such tokens are meant to be kept.
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return " ".join(words)

# Rows with a missing 'text' value carry nothing to clean. Drop them before
# applying the cleaner rather than filtering on 'clean_text' afterwards: the
# cleaner's output is always a string, so a NaN check on it can never match.
# .copy() makes the result an independent frame before a column is added.
df_all = df_all.dropna(subset=['text']).copy()

# Apply text cleaning
df_all['clean_text'] = df_all['text'].apply(clean_text)

# Drop rows where clean_text is empty or whitespace-only
df_all = df_all[df_all['clean_text'].str.strip().astype(bool)]

# Save cleaned data to Google Drive
cleaned_data_path = '/content/drive/MyDrive/clean_data.csv'
df_all.to_csv(cleaned_data_path, index=False)

print(f"Cleaned data saved to: {cleaned_data_path}")
print(f"Shape after cleaning: {df_all.shape}")
# Preview of raw vs. cleaned text alongside the label
df_all[['text', 'clean_text', 'sentiment']].head()

def get_clean_text_function():
    """Return the ``clean_text`` callable defined in this notebook.

    Returns:
        The ``clean_text`` function object itself (not a wrapper), so callers
        can apply the same cleaning steps to their own text.
    """
    cleaner = clean_text
    return cleaner


Mounted at /content/drive


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Cleaned data saved to: /content/drive/MyDrive/clean_data.csv
Shape after cleaning: (516, 9)


Unnamed: 0,text,clean_text,sentiment
0,@stellargirl I loooooooovvvvvveee my Kindle2. ...,loooooooovvvvvveee kindle2 dx cool 2 fantastic...,positive
1,Reading my kindle2... Love it... Lee childs i...,reading kindle2 love lee child good read,positive
2,"Ok, first assesment of the #kindle2 ...it fuck...",ok first assesment fucking rock,positive
3,@kenburbary You'll love your Kindle2. I've had...,youll love kindle2 ive mine month never looked...,positive
4,@mikefish Fair enough. But i have the Kindle2...,fair enough kindle2 think perfect,positive
