In [1]:
# Text Preprocessing - Customer Feedback Analytics
# This notebook demonstrates the text preprocessing pipeline for customer reviews.

# 1. Import libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import spacy
from textblob import TextBlob
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import warnings

# Suppress all warnings for the remainder of the session so they do not
# clutter the cell output.
warnings.filterwarnings('ignore')

# Download the NLTK resources this notebook asks for, in a fixed order and
# without progress output.
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource, quiet=True)

print("Libraries loaded successfully!")

# 2. Import custom preprocessor
import os
import sys

# Register the project root (the parent of the notebook directory) as an
# absolute path so the `src` package stays importable regardless of later
# working-directory changes; the membership check keeps re-runs from adding
# duplicate entries.
project_root = os.path.abspath('..')
if project_root not in sys.path:
    sys.path.append(project_root)

# The recorded run of this notebook stopped here with
#   FileNotFoundError: [Errno 2] No such file or directory: 'config/config.yaml'
# NOTE(review): this presumably means the preprocessor resolves its config
# relative to the working directory and expects to run from the project root;
# confirm against src/data_processing/preprocessor.py. Until then, import and
# construct it with the project root as the working directory, then restore
# the notebook directory so the '../data' and '../results' paths used further
# down keep resolving as before.
notebook_dir = os.getcwd()
os.chdir(project_root)
try:
    from src.data_processing.preprocessor import TextPreprocessor

    preprocessor = TextPreprocessor()
finally:
    os.chdir(notebook_dir)

# 3. Load Raw Data
# Read the raw review export, report how many rows arrived, and preview the
# first few of them.
raw_reviews_path = '../data/raw/customer_reviews.csv'
df = pd.read_csv(raw_reviews_path)

review_total = len(df)
print(f"Loaded {review_total} reviews")

preview_rows = df.head()
display(preview_rows)

# 4. Text Cleaning Steps (demonstration on one sample)
# Walk a single review through each preprocessor stage, printing the text
# after every stage so the effect of each step is visible.
STAGE_DIVIDER = "\n" + "="*50 + "\n"


def _report_stage(heading, text, divider_after=True):
    """Print a stage heading and its text, optionally followed by the divider."""
    print(heading)
    print(text)
    if divider_after:
        print(STAGE_DIVIDER)


sample_text = df['review_text'].iloc[0]
_report_stage("Original text:", sample_text)

# Step 1: Basic cleaning
cleaned_text = preprocessor.clean_text(sample_text)
_report_stage("After basic cleaning:", cleaned_text)

# Step 2: Handle negations
negation_handled = preprocessor.handle_negations(cleaned_text)
_report_stage("After handling negations:", negation_handled)

# Step 3: Remove stopwords
no_stopwords = preprocessor.remove_stopwords(negation_handled)
_report_stage("After removing stopwords:", no_stopwords)

# Step 4: Lemmatization (last stage, so no trailing divider)
lemmatized = preprocessor.lemmatize_text(no_stopwords)
_report_stage("After lemmatization:", lemmatized, divider_after=False)

# 5. Apply Preprocessing to Dataset
# Run the preprocessor's dataset-level pipeline on the raw frame, then report
# how the shape changed and which column names are new.
print("Applying preprocessing pipeline...")
df_processed = preprocessor.preprocess_dataset(df)

shape_before, shape_after = df.shape, df_processed.shape
print(f"\nPreprocessing complete!\nOriginal shape: {shape_before}\nProcessed shape: {shape_after}")

added_columns = set(df_processed.columns).difference(df.columns)
print(f"New columns added: {added_columns}")

# 6. Feature Analysis
# Columns present after preprocessing but absent from the raw frame, in the
# order they appear in the processed frame.
feature_cols = []
for column_name in df_processed.columns:
    if column_name not in df.columns:
        feature_cols.append(column_name)


def _features_with(*keywords):
    """Return the new feature names that contain at least one of `keywords`."""
    return [name for name in feature_cols if any(key in name for key in keywords)]


print(f"New features created: {len(feature_cols)}")
print("\nFeature categories:")
print("- Text statistics:", _features_with('count', 'length', 'ratio'))
print("- Sentiment features:", _features_with('textblob', 'positive', 'negative'))
print("- Time features:", _features_with('hour', 'dayofweek', 'month'))

# Summary statistics for the new features, rounded to two decimals for display.
feature_stats = df_processed[feature_cols].describe()
rounded_stats = feature_stats.round(2)
display(rounded_stats)

# 7. Text Transformation Analysis
def _plot_word_count_hist(ax, counts, title, color):
    """Draw a 50-bin word-count histogram on `ax` with a dashed line at the mean.

    Missing counts are dropped before plotting; the mean already skips them.
    """
    mean_count = counts.mean()
    ax.hist(counts.dropna(), bins=50, color=color, edgecolor='black')
    ax.set_xlabel('Word Count')
    ax.set_ylabel('Frequency')
    ax.set_title(title)
    ax.axvline(mean_count, color='red', linestyle='--', label=f'Mean: {mean_count:.1f}')
    ax.legend()


# Side-by-side preview of the first five processed reviews. The originals are
# looked up by the processed rows' index labels, so rows dropped during
# preprocessing cannot leave a NaN in the 'cleaned' column (which would make
# the slicing below raise TypeError).
# NOTE(review): this assumes preprocess_dataset keeps the raw frame's index
# labels; if it resets the index the pairs are matched by position instead.
cleaned_sample = df_processed['cleaned_text'].head(5)
comparison_df = pd.DataFrame({
    'original': df['review_text'].reindex(cleaned_sample.index),
    'cleaned': cleaned_sample,
})
# Number the previews by position so the loop works for any index type, and
# cast to str so a missing review prints as 'nan' instead of failing to slice.
for position, (_, row) in enumerate(comparison_df.iterrows(), start=1):
    original_preview = str(row['original'])[:100]
    cleaned_preview = str(row['cleaned'])[:100]
    print(f"\nReview {position}:\nOriginal: {original_preview}...\nCleaned: {cleaned_preview}...\n" + "-"*80)

fig, axes = plt.subplots(1, 2, figsize=(12, 5))
# Whitespace-delimited token count of the raw text. Stored on `df` because the
# summary statistics in section 12 read this column.
df['original_word_count'] = df['review_text'].str.split().str.len()
_plot_word_count_hist(axes[0], df['original_word_count'],
                      'Original Text Word Count Distribution', 'skyblue')
_plot_word_count_hist(axes[1], df_processed['word_count'],
                      'Cleaned Text Word Count Distribution', 'lightgreen')
plt.tight_layout()
plt.show()
print(f"Average reduction in word count: {(1 - df_processed['word_count'].mean() / df['original_word_count'].mean()) * 100:.1f}%")

# 8. Sentiment Feature Analysis
# One entry per histogram panel:
# (column, bar colour, x-axis label, title, reference-line x, reference-line label)
histogram_specs = [
    ('textblob_polarity', 'purple', 'Polarity Score',
     'TextBlob Polarity Distribution', 0, 'Neutral'),
    ('textblob_subjectivity', 'orange', 'Subjectivity Score',
     'TextBlob Subjectivity Distribution', 0.5, 'Mid-point'),
]
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, (column, color, xlabel, title, ref_x, ref_label) in zip(axes, histogram_specs):
    ax.hist(df_processed[column], bins=50, color=color, alpha=0.7, edgecolor='black')
    ax.set(xlabel=xlabel, ylabel='Frequency', title=title)
    ax.axvline(ref_x, color='red', linestyle='--', label=ref_label)
    ax.legend()
plt.tight_layout()
plt.show()

# Box plots of each sentiment feature grouped by the labelled sentiment.
sentiment_features = ['textblob_polarity', 'positive_word_count', 'negative_word_count']
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for ax, feature in zip(axes, sentiment_features):
    df_processed.boxplot(column=feature, by='true_sentiment', ax=ax)
    ax.set(title=f'{feature} by True Sentiment', xlabel='True Sentiment', ylabel=feature)
# Blank out the figure-level title so only the per-panel titles remain.
plt.suptitle('')
plt.tight_layout()
plt.show()

# 9. Special Character Analysis
# Bar charts of the per-sentiment mean of each punctuation/casing feature,
# with bars ordered from the lowest mean to the highest.
punct_features = ['exclamation_count', 'question_count', 'caps_ratio']
# Fixed colour per sentiment label; any other label falls back to gray.
colors = {'positive': '#2ECC71', 'negative': '#E74C3C', 'neutral': '#95A5A6'}

fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for ax, feature in zip(axes, punct_features):
    sentiment_groups = df_processed.groupby('true_sentiment')[feature].mean().sort_values()
    bar_colors = [colors.get(label, 'gray') for label in sentiment_groups.index]
    ax.bar(sentiment_groups.index, sentiment_groups.values, color=bar_colors)
    ax.set(xlabel='Sentiment', ylabel=f'Average {feature}', title=f'{feature} by Sentiment')
plt.tight_layout()
plt.show()

# 10. Word Cloud Comparison
# 2x2 grid: one row per text source (raw vs. cleaned), one column per
# sentiment class.
cloud_sources = [
    ('Original', df, 'review_text'),
    ('Cleaned', df_processed, 'cleaned_text'),
]
cloud_colormaps = {'positive': 'Greens', 'negative': 'Reds'}

fig, axes = plt.subplots(2, 2, figsize=(15, 12))
for row_idx, (source_label, frame, text_col) in enumerate(cloud_sources):
    for col_idx, (sentiment, colormap) in enumerate(cloud_colormaps.items()):
        ax = axes[row_idx, col_idx]
        reviews = frame.loc[frame['true_sentiment'] == sentiment, text_col].dropna()
        # Cast to str so a stray non-string entry cannot break the join.
        text = ' '.join(reviews.astype(str))
        try:
            wordcloud = WordCloud(width=600, height=400, background_color='white',
                                  colormap=colormap).generate(text)
        except ValueError:
            # WordCloud raises ValueError when the text yields no words
            # (no reviews for this sentiment, or nothing left after its own
            # filtering); show a placeholder instead of aborting the figure.
            ax.text(0.5, 0.5, 'No words to display', ha='center', va='center',
                    transform=ax.transAxes)
        else:
            ax.imshow(wordcloud, interpolation='bilinear')
        ax.set_title(f'{source_label} - {sentiment.capitalize()} Reviews')
        ax.axis('off')
plt.suptitle('Word Clouds: Original vs Cleaned Text', fontsize=16)
plt.tight_layout()
plt.show()

# 11. Feature Correlation Analysis
# Pairwise correlation of the rating with the engineered numeric features,
# rendered as an annotated heatmap centred on zero.
corr_features = [
    'rating', 'word_count', 'exclamation_count', 'question_count',
    'caps_ratio', 'textblob_polarity', 'textblob_subjectivity',
    'positive_word_count', 'negative_word_count',
]
corr_matrix = df_processed[corr_features].corr()

heatmap_options = dict(
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
)
plt.figure(figsize=(10, 8))
heatmap_ax = sns.heatmap(corr_matrix, **heatmap_options)
heatmap_ax.set_title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

# 12. Save Preprocessed Data and Stats
import json
import os

# Write the processed frame to CSV, creating the target directory first so a
# fresh checkout without data/processed/ does not fail on the write.
output_path = '../data/processed/preprocessed_reviews.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_processed.to_csv(output_path, index=False)
print(f"Preprocessed data saved to: {output_path}")

# Summary of what the pipeline did. Numeric aggregates are cast to built-in
# floats so json.dump never has to serialise a numpy scalar type.
preprocessing_stats = {
    'original_reviews': len(df),
    'processed_reviews': len(df_processed),
    'reviews_filtered': len(df) - len(df_processed),
    'avg_word_reduction': float((1 - df_processed['word_count'].mean() / df['original_word_count'].mean()) * 100),
    'features_added': len(feature_cols),
    'avg_textblob_polarity': float(df_processed['textblob_polarity'].mean()),
    'avg_textblob_subjectivity': float(df_processed['textblob_subjectivity'].mean())
}

# exist_ok=True replaces the separate exists() check, which could race with
# another process creating the directory in between.
os.makedirs('../results', exist_ok=True)
with open('../results/preprocessing_stats.json', 'w') as f:
    json.dump(preprocessing_stats, f, indent=2)
print("\nPreprocessing statistics:")
for key, value in preprocessing_stats.items():
    print(f"  {key}: {value}")




Libraries loaded successfully!


FileNotFoundError: [Errno 2] No such file or directory: 'config/config.yaml'