# Notebook 2: Exploratory Analysis

This notebook conducts an exploratory analysis of the dataset under examination. Section 2.1 assesses the class distribution of the preliminary labeled dataset, and section 2.2 evaluates the distribution of document lengths, approximated by the number of tokens per document. Finally, in section 2.3, the processed data from notebook #1 is transformed with the TF-IDF 1 method (thesis section 3.3.2) and projected onto two dimensions with PCA (thesis section 3.4.1) and LDA (thesis section 3.4.2) to assess the distribution of documents and the extent to which classes overlap.

The results are presented in the thesis section 4.1.

Table of Contents:
* [2.1 Initial class distribution of labeled document corpus](#dist)
* [2.2 Analysis of varying document lengths](#lengths)
* [2.3 Two-dimensional projection of labeled dataset](#2d)
    * [2.3.1 PCA](#pca)
    * [2.3.2 LDA](#lda)

In [None]:
# loading modules

from collections import Counter
import matplotlib.pyplot as plt
import pickle
import sys
import numpy as np
import seaborn as sns
from sklearn.decomposition import IncrementalPCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# loading datasets
# NOTE: the loading code itself is omitted here; the lines below are placeholders.
# The cells further down reference the loaded objects as `data`, `labels` and
# `data_unlabeled`, so those names must be bound before running them.

# load data (labeled_X)
# load labels (labeled_y)
# load data_unlabeled (unlabeled_X)

# Human-readable class names; the integer values in `labels` are used as
# indices into this list when printing shares and building plot legends.
text_labels = ['Agreement', 'Amendment', 'Attachment', 'LOI', 'NDA', 'Offer', 'SOW']

# minimum document frequency a term needs in order to be kept by the vectorizer
def min_df(data, percentage):
    """Return the absolute document-frequency threshold for *data*.

    The threshold is the given fraction of the number of items in ``data``,
    rounded to the nearest integer (Python's ``round``, i.e. ties go to the
    even neighbour).

    Args:
        data: Any sized collection of documents (only ``len(data)`` is used).
        percentage: Fraction of the collection size, e.g. ``0.05`` for 5 %.

    Returns:
        The rounded threshold as an ``int``.
    """
    return int(round(len(data) * percentage))

# 2.1 Initial class distribution of labeled document corpus <a id="dist"></a>

In [None]:
# Relative share of every document class in the labeled corpus.
# Absolute counts and relative shares are kept in separate dicts so the loop
# below iterates a name that actually exists and no variable is shadowed.
class_counts = dict(sorted(Counter(labels).items()))  # absolute count per class id, ordered by id
n_labeled = len(labels)
percentages = {class_id: count / n_labeled for class_id, count in class_counts.items()}  # relative share per class id

for class_id, share in percentages.items():
    # class ids index into text_labels; print the share in percent with one decimal
    print(text_labels[class_id], round(share * 100, 1), '%')

# 2.2 Analysis of varying document lengths <a id="lengths"></a>

In [None]:
# Distribution of document lengths over the whole corpus.
# The length of each entry in the first column is collected for the labeled
# documents first, followed by the unlabeled documents.
counts = [
    len(document)
    for corpus in (data, data_unlabeled)
    for document in corpus.iloc[:, 0]
]

# histogram with 500 bins and a logarithmic count axis
plt.hist(counts, 500, log=True)

# 2.3 Two-dimensional projection of labeled dataset <a id="2d"></a>

In [None]:
# TF-IDF 1 representation of the text
labeled_data = data['TEXT_PROCESSED']  # processed text of the labeled documents
unlabeled_data = data_unlabeled['TEXT_PROCESSED']  # processed text of the unlabeled documents

# A term must occur in at least 5 % of the labeled documents to be kept.
# Clamp to 1: for very small corpora the rounded threshold can become 0, which
# filters nothing (same as 1) but is rejected as an integer min_df by recent
# scikit-learn releases.
min_df_c = max(1, min_df(labeled_data, 0.05))

# uni- and bigram TF-IDF vectorizer with L2-normalised, sublinear term frequencies
tfidf = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 2),
    min_df=min_df_c,
    max_features=None,
    norm='l2',
    smooth_idf=True,
    sublinear_tf=True,
)
tfidf_vectorizer_best = tfidf.fit(labeled_data)  # vocabulary and idf weights come from the labeled set only
labeled_tfidf = tfidf_vectorizer_best.transform(labeled_data)  # transform labeled dataset
unlabled_tfidf = tfidf_vectorizer_best.transform(unlabeled_data)  # transform unlabeled dataset (name kept as-is for later use)

# Number of features = size of the fitted vocabulary. Using `vocabulary_`
# avoids `get_feature_names()`, which newer scikit-learn versions no longer provide.
features = len(tfidf_vectorizer_best.vocabulary_)

## 2.3.1 PCA <a id="pca"></a>

In [None]:
# Dense copy of the labeled TF-IDF matrix as a plain ndarray
d = labeled_tfidf.toarray()
n_docs = d.shape[0]

chunk_size = 1000  # how many documents are fed to IPCA per partial_fit call
n_components = 2  # project onto two dimensions
# NOTE: per the scikit-learn docs `batch_size` is only used by `fit()`; the
# chunking for `partial_fit` is done explicitly below.
ipca = IncrementalPCA(n_components=n_components, whiten=False, batch_size=16)

# Chunk boundaries covering ALL documents, including a trailing partial chunk
# (flooring n_docs // chunk_size would silently leave it out of the fit).
bounds = list(range(0, n_docs, chunk_size)) + [n_docs]
if len(bounds) > 2 and bounds[-1] - bounds[-2] < n_components:
    # a remainder smaller than n_components cannot be fitted on its own,
    # so merge it into the preceding chunk
    del bounds[-2]

for i, (start, stop) in enumerate(zip(bounds[:-1], bounds[1:])):
    ipca.partial_fit(d[start:stop])  # incrementally fit the PCA on this chunk
    print(i)  # progress indicator

# Transform the labeled documents in slices to limit peak memory. The slices
# are derived from the corpus size, so no empty slice is ever passed to
# `transform`, whatever the number of documents.
transform_chunk = 3000
data_pca = np.concatenate(
    [
        ipca.transform(d[start:start + transform_chunk])
        for start in range(0, n_docs, transform_chunk)
    ]
)  # all projected documents in original order

# readable class name for every labeled document, used as legend entries
labels_t = [text_labels[class_id] for class_id in labels]

# scatter plot of the documents in the plane of the first two principal components
plt.figure(figsize=(16, 10))
with sns.color_palette("husl", 7):
    ax = sns.scatterplot(
        x=data_pca[:, 0],
        y=data_pca[:, 1],
        hue=labels_t,
        legend='full',
    )

# enlarge legend and axis captions for readability
ax.legend(prop={'size': 16}, markerscale=2.2)
ax.set_xlabel('Principal Component 1', fontsize=16)
ax.set_ylabel('Principal Component 2', fontsize=16)

## 2.3.2 LDA <a id="lda"></a>

In [None]:
# Linear discriminant analysis to project the labeled data onto two dimensions.
# The sparse TF-IDF matrix is densified once with `.toarray()` and reused:
# `.todense()` returns an `np.matrix`, which recent scikit-learn versions
# reject, and calling it twice allocates the dense matrix twice.
dense_tfidf = labeled_tfidf.toarray()
lda = LDA(n_components=2).fit(dense_tfidf, labels)  # supervised fit on the class labels
data_lda = lda.transform(dense_tfidf)  # 2-D coordinates of every labeled document

# readable class name for every labeled document, used as legend entries
labels_t = [text_labels[class_id] for class_id in labels]

# scatter plot of the documents in the plane of the two discriminant components
plt.figure(figsize=(16, 10))
with sns.color_palette("husl", 7):
    ax = sns.scatterplot(
        x=data_lda[:, 0],
        y=data_lda[:, 1],
        hue=labels_t,
        legend='full',
    )

# enlarge legend and axis captions for readability
ax.legend(prop={'size': 16}, markerscale=2.2)
ax.set_xlabel('Component 1', fontsize=16)
ax.set_ylabel('Component 2', fontsize=16)