# Feature Extraction using Unsupervised Methods

This notebook augments the X_train and X_test dataframes with additional features from PCA extraction, MDS, and K-Means clustering techniques.

Additionally, it preserves all objects as python 'pickle' files so that results can be replicated in the future.

In [1]:
#123456789012345678901234567890123456789012345678901234567890123456789012345678

In [2]:
import sys
PATH_TO_MODULES = '../../..'
sys.path.insert(0, PATH_TO_MODULES)

from helpers import *
from unsupervised_helpers import *

In [3]:


def extract_features(input_df: pd.DataFrame, pickle_path='pickles/') -> pd.DataFrame:
    """Extract features from a training dataset using PCA.

    Use this function only on the TRAINING set.
    This function extracts features using PCA and assigns cluster labels
    using K-Means.  It also saves an augmented dataframe as a CSV file, and 
    preserves the objects used to transform the data as 'pickle' files.
    """
    dataset = input_df.copy()
    if 'Unnamed: 0' in dataset.columns:
        dataset.drop(columns=['Unnamed: 0'], inplace=True)
    # Process for PCA
    pca_df = process_for_PCA(dataset)
    # We found that the top 50 PCs explain 85% of the variance
    pca = PCA(50)
    X_PCA = pca.fit_transform(pca_df)
    # Pickle the pca object
    with open(pickle_path + 'pca_all.pickle', 'wb') as f:
        pickle.dump(pca, f)
    # Make a dataframe of the top 50 PCs
    PCA_cols = ['PCA_all_PC' + str(i + 1) for i in range(X_PCA.shape[1])]
    top50_PC_all_df = pd.DataFrame(X_PCA, columns=PCA_cols)
    # Perform PCA using only the KPI subset
    subset_df = get_KPI(dataset)
    subset_pca_df = process_for_PCA(subset_df)
    pca_KPI = PCA(10)
    X_PCA_KPI = pca_KPI.fit_transform(pca_df)
    # pickle the pca_KPI object
    with open(pickle_path + 'pca_KPI.pickle', 'wb') as f:
        pickle.dump(pca_KPI, f)
    # Make a dataframe of the top 10 PCs from the KPI subset
    PCA_KPI_cols = ['PCA_KPI_PC' + str(i + 1) for i in range(X_PCA_KPI.shape[1])]
    top10_PC_KPI_df = pd.DataFrame(X_PCA_KPI, columns=PCA_KPI_cols)
    # Use K-Means to assign cluster labels
    kmeans = KMeans(n_clusters=7, init='random', n_init=100, copy_x=False)
    cluster = kmeans.fit_predict(X_PCA_KPI)
    # Pickle the K-Means object
    with open(pickle_path + 'kmeans.pickle', 'wb') as f:
        pickle.dump(kmeans, f)
    # Create augmented DataFrame
    augmented_df = pd.concat(
        (
            dataset,
            top50_PC_all_df,
            top10_PC_KPI_df,
            pd.Series(cluster, name='Cluster')
        ), axis = 1
    )
    return augmented_df

    
def transform_features(input_df: pd.DataFrame) -> pd.DataFrame:
    """Augment a dataframe with extracted features.

    Use this function on the test set.
    This function uses pickled objects that were used during feature extraction
    on the test set to augment the test set with additional features.
    """
    dataset = input_df.copy()
    if 'Unnamed: 0' in dataset.columns:
        dataset.drop(columns=['Unnamed: 0'], inplace=True)
    # Need to finish this....
    return dataset
    
    


## Load data


In [4]:
# Import merged dataset
path = PATH_TO_MODULES + '/datasets/'
input_filename = 'X_train_filled_KPIs_QoQ.csv'
dataset = pd.read_csv(path + input_filename)
print(f'There are {dataset.shape[0]} rows and {dataset.shape[1]} columns in the dataset.')

There are 1910 rows and 265 columns in the dataset.


In [6]:
pickle_path = PATH_TO_MODULES + '/pickles/'
augmented_df = extract_features(dataset, pickle_path=pickle_path)
augmented_df

Unnamed: 0,Ticker,Name,Sector,CapitalExpenditure_2024Q2,CapitalExpenditure_2024Q3,CapitalExpenditure_2024Q4,CapitalExpenditure_2025Q1,CashAndSTInvestments_2024Q2,CashAndSTInvestments_2024Q3,CashAndSTInvestments_2024Q4,...,PCA_KPI_PC2,PCA_KPI_PC3,PCA_KPI_PC4,PCA_KPI_PC5,PCA_KPI_PC6,PCA_KPI_PC7,PCA_KPI_PC8,PCA_KPI_PC9,PCA_KPI_PC10,Cluster
0,ACIW,ACI WORLDWIDE INC,Information Technology,-3777000.0,-4045500.0,-4663500.0,-4112000.0,1.851080e+08,1.970750e+08,1.883640e+08,...,-0.123216,0.554203,0.635451,1.822323,0.574676,-0.658164,0.246607,0.794238,0.242110,1
1,HONE,HARBORONE BANCORP INC,Financials,-220000.0,-569000.0,-339000.0,-208000.0,2.350620e+08,2.242790e+08,2.310710e+08,...,1.653149,-1.471926,-0.344523,1.129446,-1.540118,0.719991,-0.793320,0.414424,-0.311225,6
2,REPL,REPLIMUNE GROUP INC,Health Care,-1618000.0,-1114000.0,-2266000.0,-1503000.0,8.278500e+07,7.524700e+07,7.830350e+07,...,-0.800500,0.397646,-0.933726,0.954682,-0.089657,0.739249,-0.597600,-0.967186,0.458068,0
3,RBRK,RUBRIK INC CLASS A,Information Technology,-15766000.0,-4929000.0,-7527000.0,-8401000.0,4.293020e+08,1.423490e+08,1.038960e+08,...,-1.510564,-0.243129,-2.950378,-0.545907,0.209089,0.176353,0.822190,-0.061928,0.233287,0
4,CSL,CARLISLE COMPANIES INC,Industrials,-24900000.0,-19300000.0,-36600000.0,-29000000.0,1.736300e+09,1.530600e+09,7.535000e+08,...,-0.909721,0.196802,-0.543445,-1.068914,-0.480607,-1.400929,-0.783283,0.340253,-1.385225,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1905,CNS,COHEN & STEERS INC,Financials,-4239000.0,-1408000.0,-1678000.0,-1075000.0,1.220130e+08,1.064740e+08,1.829740e+08,...,0.866749,2.232555,0.594781,0.277588,-0.506149,-0.634160,0.468937,0.529396,-1.060750,1
1906,FBP,FIRST BANCORP,Financials,-3264000.0,-2547000.0,-3407000.0,-3396000.0,4.240230e+08,5.029450e+08,5.027000e+08,...,1.570123,-0.617818,-0.528226,-0.326411,-1.240525,-0.932261,-0.447994,-0.311293,0.681022,6
1907,RDDT,REDDIT INC CLASS A,Communication,-1202000.0,-1353000.0,-842000.0,-979000.0,4.679520e+08,5.158950e+08,5.620920e+08,...,0.095268,2.070966,-0.487123,1.674875,0.537885,-0.467444,-0.065256,-0.251066,0.669931,1
1908,AGM,FEDERAL AGRICULTURAL MORTGAGE NON,Financials,-3568000.0,-66000.0,-3407000.0,-3396000.0,9.229610e+08,8.420560e+08,1.007817e+09,...,1.508701,-1.586222,0.096204,1.063445,-0.604190,0.619200,0.692641,0.058824,0.004985,6
