##### =============================================================================
##### FILE: src/preprocess.ipynb
##### PROJECT: Future Revenue Forecasting (FDE)
##### DESCRIPTION: Full preprocessing + NLP + PCA + Feature Selection pipeline
#### =============================================================================

In [63]:
# 📦 Step 1: Import all dependencies
import os
import logging
import pandas as pd
import numpy as np
import nltk
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.base import BaseEstimator, TransformerMixin
from joblib import dump
from scipy.sparse import hstack

# NLTK downloads
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


In [64]:
import os
import logging

# Define log path (one level up from 'src')
log_file_path = os.path.join("..", "logs", "preprocessing.log")

# Make sure the logs folder exists
os.makedirs(os.path.dirname(log_file_path), exist_ok=True)

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s",
    handlers=[
        logging.FileHandler(log_file_path),
        logging.StreamHandler()
    ]
)

logger = logging.getLogger(__name__)
logger.info("Logging initialized. Logs will be saved in ../logs/preprocessing.log")


2025-10-31 11:22:41,874 | INFO | Logging initialized. Logs will be saved in ../logs/preprocessing.log


In [65]:
# ✨ Step 3: Custom text preprocessing class
class TextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, text_col='line_item_description'):
        self.text_col = text_col

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        """Process each row individually, maintaining DataFrame structure"""
        if isinstance(X, pd.DataFrame):
            # Process each description separately
            cleaned = X[self.text_col].apply(self._clean_text)
            return pd.DataFrame({self.text_col: cleaned})
        return X
    
    def _clean_text(self, text):
        """Clean a single text string"""
        if pd.isna(text) or text == "":
            return ""
        text = str(text).lower()
        text = re.sub(r'[^a-z\s]', '', text)  # keep only letters and spaces
        text = ' '.join(text.split())  # normalize whitespace
        return text


In [66]:
# 📂 Step 4: Load data
def load_data(file_path='../data/raw/revenue_data.csv'):
    logger.info("Loading dataset from %s", file_path)
    df = pd.read_csv(file_path)
    logger.info("Initial shape: %s", df.shape)
    return df

df = load_data()
df.head()


2025-10-31 11:22:41,897 | INFO | Loading dataset from ../data/raw/revenue_data.csv
2025-10-31 11:22:41,922 | INFO | Initial shape: (21600, 10)


Unnamed: 0,month,profit_center,region,product_category,line_item_description,year,month_int,quarter,is_qtr_end,revenue
0,2024-01-01,Product_A,North America,Professional Services,Installation Fee,2024,1,1,0,3461.81
1,2024-01-01,Service_C,North America,Professional Services,Installation Fee,2024,1,1,0,3825.41
2,2024-01-01,Product_B,EMEA,Hardware Sales,Standard Monthly Fee,2024,1,1,0,5632.22
3,2024-01-01,Service_C,North America,Software Subscription,Legacy Product Support,2024,1,1,0,3868.49
4,2024-01-01,Product_A,APAC,Maintenance,Legacy Product Support,2024,1,1,0,6184.16


In [67]:
# 🧹 Step 5: Build preprocessing pipeline (structured + text)
def build_pipeline():
    logger.info("Building preprocessing + PCA + feature selection pipeline...")
    
    categorical_features = ['region', 'product_category']
    numeric_features = ['revenue', 'month_int', 'year', 'quarter']
    text_feature = 'line_item_description'

    # ---- Structured Pipelines ----
    categorical_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ])

    numeric_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=2))  # reduce dimensionality for numeric
    ])

    structured_pipeline = ColumnTransformer(transformers=[
        ('num', numeric_pipeline, numeric_features),
        ('cat', categorical_pipeline, categorical_features)
    ])

    # ---- Text Pipeline ----
    text_pipeline = Pipeline([
        ('cleaner', TextPreprocessor(text_col=text_feature)),
        ('tfidf', TfidfVectorizer(max_features=1000)),
        ('selector', SelectKBest(chi2, k=300))  # feature selection
    ])

    return structured_pipeline, text_pipeline, text_feature


In [68]:
df.columns

Index(['month', 'profit_center', 'region', 'product_category',
       'line_item_description', 'year', 'month_int', 'quarter', 'is_qtr_end',
       'revenue'],
      dtype='object')

In [69]:
# def run_preprocessing(input_path='../data/raw/revenue_data.csv',
#                       output_clean='../data/processed/pre_processed_data.csv',
#                       pipeline_path='../models/preprocessing_pipeline.pkl'):
#     logger.info("Starting preprocessing workflow...")

#     df = load_data(input_path)
#     df['line_item_description'] = df['line_item_description'].fillna("")

#     structured_features = df.drop(columns=['line_item_description'])
#     text_data = df['line_item_description']  # ✅ Pass Series, not DataFrame

#     structured_pipeline, text_pipeline, text_col = build_pipeline()

#     logger.info("Fitting structured pipeline...")
#     structured_pipeline.fit(structured_features)

#     # ✅ Fit text pipeline with Series
#     y_dummy = np.random.randint(0, 2, size=len(df))
#     text_pipeline.fit(text_data, y_dummy)

#     # ✅ Transform both parts
#     X_struct = structured_pipeline.transform(structured_features)
#     X_text = text_pipeline.transform(text_data)

#     logger.info("Combining structured + text features...")
#     X_final = hstack([X_struct, X_text])

#     X_final_df = pd.DataFrame(X_final.toarray())
#     X_final_df.to_csv(output_clean, index=False)
#     logger.info("Clean data saved to %s", output_clean)

#     dump({'structured_pipeline': structured_pipeline,
#           'text_pipeline': text_pipeline}, pipeline_path)
#     logger.info("Pipeline saved to %s", pipeline_path)

#     logger.info("✅ Preprocessing completed successfully!")
#     return X_final_df

In [70]:
def run_preprocessing(input_path='../data/raw/revenue_data.csv',
                      output_clean='../data/processed/pre_processed_data.csv',
                      pipeline_path='../models/preprocessing_pipeline.pkl'):
    logger.info("Starting preprocessing workflow...")

    df = load_data(input_path)
    df['line_item_description'] = df['line_item_description'].fillna("")

    structured_features = df.drop(columns=['line_item_description'])
    text_data = df['line_item_description']

    structured_pipeline, text_pipeline, text_col = build_pipeline()

    logger.info("Fitting structured pipeline...")
    structured_pipeline.fit(structured_features)

    # ✅ Fit text pipeline with Series
    y_dummy = np.random.randint(0, 2, size=len(df))
    text_pipeline.fit(text_data, y_dummy)

    # ✅ Transform both parts
    X_struct = structured_pipeline.transform(structured_features)
    X_text = text_pipeline.transform(text_data)

    logger.info("Combining structured + text features...")
    X_final = hstack([X_struct, X_text])

    # ✅ Generate column names
    column_names = _get_feature_names(structured_pipeline, text_pipeline)
    
    X_final_df = pd.DataFrame(X_final.toarray(), columns=column_names)
    X_final_df.to_csv(output_clean, index=False)
    logger.info("Clean data saved to %s", output_clean)

    dump({'structured_pipeline': structured_pipeline,
          'text_pipeline': text_pipeline}, pipeline_path)
    logger.info("Pipeline saved to %s", pipeline_path)

    logger.info("✅ Preprocessing completed successfully!")
    return X_final_df


def _get_feature_names(structured_pipeline, text_pipeline):
    """Extract feature names from fitted pipelines"""
    
    # Get structured feature names
    structured_names = []
    
    # Numeric features (after PCA)
    n_pca_components = structured_pipeline.named_transformers_['num'].named_steps['pca'].n_components_
    structured_names.extend([f'pca_numeric_{i}' for i in range(n_pca_components)])
    
    # Categorical features (one-hot encoded)
    try:
        cat_encoder = structured_pipeline.named_transformers_['cat'].named_steps['encoder']
        cat_feature_names = cat_encoder.get_feature_names_out(['region', 'product_category'])
        structured_names.extend(cat_feature_names)
    except AttributeError:
        # Fallback if get_feature_names_out not available
        cat_encoder = structured_pipeline.named_transformers_['cat'].named_steps['encoder']
        n_cat_features = len(cat_encoder.get_feature_names(['region', 'product_category']))
        structured_names.extend([f'cat_{i}' for i in range(n_cat_features)])
    
    # Get text feature names (after SelectKBest)
    tfidf = text_pipeline.named_steps['tfidf']
    selector = text_pipeline.named_steps['selector']
    
    # Get all TF-IDF feature names
    all_tfidf_names = tfidf.get_feature_names_out()
    
    # Get selected indices from SelectKBest
    selected_indices = selector.get_support(indices=True)
    text_names = [f'text_{all_tfidf_names[i]}' for i in selected_indices]
    
    # Combine all names
    all_names = structured_names + text_names
    
    logger.info(f"Generated {len(all_names)} feature names: "
                f"{len(structured_names)} structured + {len(text_names)} text")
    
    return all_names

In [71]:
clean_data = run_preprocessing()
clean_data.head()

2025-10-31 11:22:42,004 | INFO | Starting preprocessing workflow...
2025-10-31 11:22:42,005 | INFO | Loading dataset from ../data/raw/revenue_data.csv
2025-10-31 11:22:42,028 | INFO | Initial shape: (21600, 10)
2025-10-31 11:22:42,031 | INFO | Building preprocessing + PCA + feature selection pipeline...
2025-10-31 11:22:42,032 | INFO | Fitting structured pipeline...
2025-10-31 11:22:42,173 | INFO | Combining structured + text features...
2025-10-31 11:22:42,180 | INFO | Generated 26 feature names: 10 structured + 16 text
2025-10-31 11:22:42,432 | INFO | Clean data saved to ../data/processed/pre_processed_data.csv
2025-10-31 11:22:42,436 | INFO | Pipeline saved to ../models/preprocessing_pipeline.pkl
--- Logging error ---
Traceback (most recent call last):
  File "C:\Users\vinna\anaconda3\Lib\logging\__init__.py", line 1163, in emit
    stream.write(msg + self.terminator)
  File "C:\Users\vinna\anaconda3\Lib\encodings\cp1252.py", line 19, in encode
    return codecs.charmap_encode(input

Unnamed: 0,pca_numeric_0,pca_numeric_1,region_APAC,region_EMEA,region_LATAM,region_North America,product_category_Hardware Sales,product_category_Maintenance,product_category_Professional Services,product_category_Software Subscription,...,text_legacy,text_license,text_monthly,text_premium,text_product,text_project,text_server,text_standard,text_support,text_unit
0,-2.3412,-1.001922,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-2.274848,-0.942134,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-1.945129,-0.645035,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.625025,0.0,0.0,0.0,0.0,0.625025,0.0,0.0
3,-2.266986,-0.93505,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.57735,0.0,0.0,0.0,0.57735,0.0,0.0,0.0,0.57735,0.0
4,-1.844408,-0.554278,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.57735,0.0,0.0,0.0,0.57735,0.0,0.0,0.0,0.57735,0.0


In [72]:
clean_data.columns

Index(['pca_numeric_0', 'pca_numeric_1', 'region_APAC', 'region_EMEA',
       'region_LATAM', 'region_North America',
       'product_category_Hardware Sales', 'product_category_Maintenance',
       'product_category_Professional Services',
       'product_category_Software Subscription', 'text_alpha', 'text_annual',
       'text_cloud', 'text_consulting', 'text_fee', 'text_installation',
       'text_legacy', 'text_license', 'text_monthly', 'text_premium',
       'text_product', 'text_project', 'text_server', 'text_standard',
       'text_support', 'text_unit'],
      dtype='object')

In [73]:
clean_data.head(10)

Unnamed: 0,pca_numeric_0,pca_numeric_1,region_APAC,region_EMEA,region_LATAM,region_North America,product_category_Hardware Sales,product_category_Maintenance,product_category_Professional Services,product_category_Software Subscription,...,text_legacy,text_license,text_monthly,text_premium,text_product,text_project,text_server,text_standard,text_support,text_unit
0,-2.3412,-1.001922,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-2.274848,-0.942134,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-1.945129,-0.645035,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.625025,0.0,0.0,0.0,0.0,0.625025,0.0,0.0
3,-2.266986,-0.93505,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.57735,0.0,0.0,0.0,0.57735,0.0,0.0,0.0,0.57735,0.0
4,-1.844408,-0.554278,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.57735,0.0,0.0,0.0,0.57735,0.0,0.0,0.0,0.57735,0.0
5,-1.998156,-0.692816,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.57735,0.0,0.0,0.57735
6,-2.02252,-0.714769,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.57735,0.0,0.57735,0.0,0.0,0.0,0.0,0.0,0.0
7,-2.181307,-0.857847,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.625025,0.0,0.0,0.0,0.0,0.625025,0.0,0.0
8,-2.314964,-0.978281,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.625025,0.0,0.0,0.0,0.0,0.625025,0.0,0.0
9,-1.394905,-0.149244,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.57735,0.0,0.57735,0.0,0.0,0.0,0.0,0.0,0.0


In [74]:
# 📊 Step 7: Quick summary of transformed data
logger.info("Final processed shape: %s", clean_data.shape)
clean_data.describe().T.head(10)

2025-10-31 11:22:42,511 | INFO | Final processed shape: (21600, 26)


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
pca_numeric_0,21600.0,-8.421247e-17,1.456282,-2.88169,-1.237667,-0.069569,1.119468,3.509112
pca_numeric_1,21600.0,0.0,1.01054,-2.040207,-0.966027,0.012162,0.961539,2.38017
region_APAC,21600.0,0.2452315,0.430234,0.0,0.0,0.0,0.0,1.0
region_EMEA,21600.0,0.2503241,0.43321,0.0,0.0,0.0,1.0,1.0
region_LATAM,21600.0,0.2509259,0.433556,0.0,0.0,0.0,1.0,1.0
region_North America,21600.0,0.2535185,0.435035,0.0,0.0,0.0,1.0,1.0
product_category_Hardware Sales,21600.0,0.254537,0.435611,0.0,0.0,0.0,1.0,1.0
product_category_Maintenance,21600.0,0.2510185,0.433609,0.0,0.0,0.0,1.0,1.0
product_category_Professional Services,21600.0,0.2493519,0.432648,0.0,0.0,0.0,0.0,1.0
product_category_Software Subscription,21600.0,0.2450926,0.430152,0.0,0.0,0.0,0.0,1.0
