# Feature Engineering

## Objective
Transform raw text data into numerical features suitable for machine learning models.

## Approach
1. Text preprocessing (cleaning, normalization)
2. Feature extraction (TF-IDF, embeddings)
3. Combine text and metadata features
4. Save feature matrices for model training

In [None]:
# Imports
import pandas as pd
import numpy as np
from pathlib import Path
import re
import pickle

# NLP
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

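# Download NLTK resources (run once; uncomment on first use)
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('punkt_tab')  # also required by word_tokenize on newer NLTK releases (3.9+)
# nltk.download('wordnet')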

## 1. Load Data

In [None]:
# Read the request records that the EDA step worked with.
# The location is relative, so it resolves against the notebook's working directory.
data_path = Path('../../../contracts/mock-data/requests.csv')
df = pd.read_csv(filepath_or_buffer=data_path)

# Quick sanity check: row count, then a preview of the first rows.
print(f"Loaded {len(df)} requests")
df.head()

## 2. Text Preprocessing

### Define Preprocessing Functions

In [None]:
# Lazily-built NLTK resources shared by every preprocess_text call, so the
# stop-word list is not re-read and the lemmatizer not re-created per row.
_STOP_WORDS = None
_LEMMATIZER = None


def _get_nlp_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first use."""
    global _STOP_WORDS, _LEMMATIZER
    if _STOP_WORDS is None or _LEMMATIZER is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
        _LEMMATIZER = WordNetLemmatizer()
    return _STOP_WORDS, _LEMMATIZER


def preprocess_text(text):
    """
    Preprocess text: lowercase, remove special chars, remove stopwords, lemmatize.

    Args:
        text: Raw text. Missing values (None / NaN / pd.NA) are accepted, and
            non-string scalars are converted with ``str()`` before processing.

    Returns:
        A single string of space-separated, lemmatized tokens; ``""`` when
        the input is missing.

    TODO: Adjust preprocessing steps based on experimentation
    """
    if pd.isna(text):
        return ""

    # Lowercase (coerce first so numeric cells do not raise AttributeError)
    text = str(text).lower()

    # Remove special characters and digits (keep letters and whitespace)
    text = re.sub(r'[^a-z\s]', '', text)

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stopwords, then lemmatize what is left
    stop_words, lemmatizer = _get_nlp_resources()
    tokens = [lemmatizer.lemmatize(t) for t in tokens if t not in stop_words]

    return ' '.join(tokens)

### Apply Preprocessing

In [None]:
# Combine title and description.
# A missing title or description is treated as empty text: with plain `+`,
# a NaN in either column turns the whole combined value into NaN, which
# discards the field that was present and breaks the slicing in the sample
# print below.
df['combined_text'] = (
    df['title'].fillna('') + ' ' + df['description'].fillna('')
).str.strip()

# Preprocess
print("Preprocessing text...")
df['processed_text'] = df['combined_text'].apply(preprocess_text)

# Check samples (head(3) also copes with datasets that have fewer than 3 rows)
print("\nOriginal vs. Processed:")
for original, processed in zip(df['combined_text'].head(3),
                               df['processed_text'].head(3)):
    print(f"\nOriginal: {original[:100]}")
    print(f"Processed: {processed[:100]}")

## 3. Feature Extraction

### TF-IDF Features

In [None]:
# Vectorizer configuration, kept in one place so it is easy to tune.
# TODO: Tune parameters (max_features, ngram_range, min_df, max_df)
tfidf_params = {
    'max_features': 1000,    # keep at most the top 1000 terms
    'ngram_range': (1, 2),   # unigrams and bigrams
    'min_df': 2,             # drop terms found in fewer than 2 documents
    'max_df': 0.8,           # drop terms found in more than 80% of documents
}
tfidf = TfidfVectorizer(**tfidf_params)

# Learn the vocabulary / IDF weights and vectorize the corpus in one pass.
# NOTE(review): this fits on every row of df, i.e. before any train/test split.
X_tfidf = tfidf.fit_transform(df['processed_text'])

print(f"TF-IDF feature matrix shape: {X_tfidf.shape}")
print(f"Feature names (sample): {tfidf.get_feature_names_out()[:20]}")

In [None]:
# Inspect top features by category
# TODO: Use this to validate features make sense
feature_names = tfidf.get_feature_names_out()

# Rows with a missing category are skipped: they would select no rows and
# produce a mean over an empty slice.
for category in df['category'].dropna().unique():
    print(f"\n=== Top features for {category} ===")
    # X_tfidf rows are positional, so translate the category mask into row
    # positions instead of reusing df's index labels (the two only coincide
    # while df still has its default 0..n-1 index).
    row_positions = np.flatnonzero((df['category'] == category).to_numpy())
    # np.asarray(...).ravel() flattens both the np.matrix returned by sparse
    # matrices and the ndarray returned by sparse arrays.
    category_tfidf = np.asarray(X_tfidf[row_positions].mean(axis=0)).ravel()
    # Ten highest mean TF-IDF weights, largest first
    top_indices = category_tfidf.argsort()[-10:][::-1]
    top_features = [str(feature_names[i]) for i in top_indices]
    print(top_features)

### Optional: Word Embeddings (Advanced)

In [None]:
# TODO: If using word embeddings (Word2Vec, GloVe, or pre-trained transformers)
# Example with sentence transformers:
# from sentence_transformers import SentenceTransformer
# model = SentenceTransformer('all-MiniLM-L6-v2')
# X_embeddings = model.encode(df['combined_text'].tolist())
# print(f"Embedding shape: {X_embeddings.shape}")

## 4. Encode Target Variables

In [None]:
# Category target: learn the label -> integer mapping, then apply it.
category_encoder = LabelEncoder()
category_encoder.fit(df['category'])
y_category = category_encoder.transform(df['category'])
print(f"Category classes: {category_encoder.classes_}")

# Priority target: same two-step encoding with its own encoder.
priority_encoder = LabelEncoder()
priority_encoder.fit(df['priority'])
y_priority = priority_encoder.transform(df['priority'])
print(f"Priority classes: {priority_encoder.classes_}")

## 5. Save Processed Data and Encoders

In [None]:
# Make sure the destination folder exists before writing anything.
output_dir = Path('../data/processed')
output_dir.mkdir(parents=True, exist_ok=True)

# Sparse TF-IDF feature matrix
import scipy.sparse
scipy.sparse.save_npz(output_dir / 'X_tfidf.npz', X_tfidf)

# Encoded targets
np.save(output_dir / 'y_category.npy', y_category)
np.save(output_dir / 'y_priority.npy', y_priority)

# Fitted vectorizer and label encoders, pickled for reuse at inference time.
pickled_artifacts = {
    'tfidf_vectorizer.pkl': tfidf,
    'category_encoder.pkl': category_encoder,
    'priority_encoder.pkl': priority_encoder,
}
for filename, artifact in pickled_artifacts.items():
    with open(output_dir / filename, 'wb') as f:
        pickle.dump(artifact, f)

print("\nSaved processed data and encoders to:", output_dir)

## 6. Additional Features (Optional)

In [None]:
# TODO: Add metadata features if available
# Examples:
# - Title length
# - Description length
# - Time of day (hour)
# - Day of week
# - Presence of urgent keywords

# df['title_length'] = df['title'].str.len()
# df['desc_length'] = df['description'].str.len()
# df['has_urgent'] = df['combined_text'].str.contains('urgent|asap|critical', case=False, na=False).astype(int)

## 7. Feature Summary

**Created Features**:
- TF-IDF vectors (up to 1000 features, unigrams + bigrams)
- Encoded category labels
- Encoded priority labels

**Saved Artifacts**:
- `X_tfidf.npz`: Feature matrix
- `y_category.npy`: Category labels
- `y_priority.npy`: Priority labels
- `tfidf_vectorizer.pkl`: Fitted vectorizer for inference
- `category_encoder.pkl`: Category label encoder
- `priority_encoder.pkl`: Priority label encoder

TODO: Document any additional features or preprocessing steps

## 8. Next Steps

1. Proceed to `03-model-training.ipynb`
2. Load processed features
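3. Split into train/test sets (note: the saved TF-IDF vectorizer was fit on the full dataset, so re-fit it on the training split only if you need a leakage-free evaluation)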
4. Train classification models
5. Evaluate performance in `04-model-evaluation.ipynb`