In [1]:
import os
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA

## Exploring Train and Test CSVs
- Check column names
- Get dataset stats
    - Num rows
    - num unique UIDs
    - num columns
    - overlap between train and test
- Understand structure of data
    - Look at column dtypes
    - Pull samples from each column

## Results
Train features 
- 4k rows
- 0 nulls
- 3 cols: all object dtype
    - uid: 4 characters; alphabetic; all unique
    - NarrativeLE: text string with typing errors, e.g. "vehicle.The..." (missing space). Seems to use special coding, e.g. "V was XXXX." and "V (XX XX)".
        - Seems like V is a placeholder for subject and XX gives identifying info on subject
    - NarrativeCME: in the first 5 rows, much of the text is the same as NarrativeLE. There are differences, e.g. in typos (row 2 of CME has a spacing error).
    
Train labels:
- 0 Nulls
- 24 cols; 1 is uid; all unique
- All cols are int64 except uid

Train features and labels have 100% overlap in uids
All Test uids are in train as well

Viewing 2 train/test overlapping samples
- No numerics, so a model will have to be trained to map text -> numeric values.
- CME was shorter and seems more succinct.
- LE has more context, but it is not clear that all of it is relevant to data extraction. Will need to test on each narrative separately and on both together.

In [2]:
# File locations: everything is resolved relative to <current working dir>/data
DIR = os.getcwd()

# raw CSV file names
TRAIN_FEATURES_FILE = "train_features_X4juyT6.csv"
TRAIN_LABELS_FILE = "train_labels_JxtENGl.csv"
TEST_FEATURES_FILE = "smoke_test_features_bWOfr2M.csv"
TEST_LABELS_FILE = "smoke_test_labels_waBGl8d.csv"

# full paths to the raw CSVs
TRAIN_FEATURES_PATH = f"{DIR}/data/{TRAIN_FEATURES_FILE}"
TRAIN_LABELS_PATH = f"{DIR}/data/{TRAIN_LABELS_FILE}"
TEST_FEATURES_PATH = f"{DIR}/data/{TEST_FEATURES_FILE}"
TEST_LABELS_PATH = f"{DIR}/data/{TEST_LABELS_FILE}"

# encoded paths (.npy arrays of encoded narratives)
#DBERTA_CME_PATH = DIR + "/data" + "/cme_deberta_enc.npy"
GTE_CME_PATH = f"{DIR}/data/cme_gte_enc.npy"
GTE_LE_PATH = f"{DIR}/data/le_gte_enc.npy"

In [3]:
# Read the GTE-encoded CME and LE arrays from their .npy files
gte_cme_features, gte_le_features = (
    np.load(enc_path) for enc_path in (GTE_CME_PATH, GTE_LE_PATH)
)

# PCA GTE Encoded Features

### Using Standard Scaler

In [4]:
# Standardise each encoding with its own scaler (fitted on that encoding only)
cme_scaler = StandardScaler()
gte_cme_features_scaled = cme_scaler.fit_transform(gte_cme_features)

le_scaler = StandardScaler()
gte_le_features_scaled = le_scaler.fit_transform(gte_le_features)

# Report the extreme values of the scaled LE array
print(f"Max LE Number: {np.max(gte_le_features_scaled)}")
print(f"Min LE Number: {np.min(gte_le_features_scaled)}")

Max LE Number: 6.439421653747559
Min LE Number: -6.21668815612793


In [5]:
# CME PCA on all vars: PCA() is built with its defaults (no n_components given)
pca_cme = PCA().fit(gte_cme_features_scaled)
# fit() returns the estimator itself, so the cell still displays the fitted PCA
pca_cme

In [6]:
# Running total of the explained-variance ratio across the CME components
cusum_expl_var_ratio = np.cumsum(pca_cme.explained_variance_ratio_)

# Count the components whose running total is still at or below 0.9.
# NOTE(review): this is the count *before* 0.9 is exceeded; the first component
# count that passes 0.9 is one higher.
within_90_mask = cusum_expl_var_ratio <= 0.9
len(cusum_expl_var_ratio[within_90_mask])

164

In [10]:
# LE PCA with default settings, fitted on the standardised LE encoding
pca_le = PCA().fit(gte_le_features_scaled)
# fit() returns the estimator itself, so the cell still displays the fitted PCA
pca_le

In [12]:
# Running total of the LE explained-variance ratio; show the first 300 entries
le_cumulative_ratio = np.cumsum(pca_le.explained_variance_ratio_)
print(le_cumulative_ratio[:300])

[0.07703985 0.14226392 0.19348478 0.23159948 0.26557723 0.29286394
 0.31784263 0.33965945 0.3602879  0.37842125 0.39530408 0.4110196
 0.4265701  0.4407542  0.45470804 0.46793327 0.48003593 0.49106097
 0.5014055  0.51141614 0.52085793 0.5298616  0.5384631  0.5467384
 0.5545423  0.5622427  0.5696931  0.5770363  0.5840555  0.59060794
 0.5971023  0.6034171  0.6096225  0.6157158  0.6214775  0.627016
 0.6323965  0.6377279  0.6428887  0.64785933 0.652739   0.6575843
 0.66232294 0.6670091  0.67150766 0.67592597 0.6802699  0.68455404
 0.6887583  0.69279695 0.69673157 0.7005892  0.7043534  0.7080489
 0.7116634  0.7152002  0.7186744  0.7220768  0.72536373 0.72858274
 0.73171544 0.7347594  0.7377671  0.740697   0.743604   0.7464491
 0.74926466 0.7520248  0.75476366 0.7574758  0.76013297 0.76270115
 0.7652501  0.7677554  0.77020526 0.77263534 0.77501076 0.77733123
 0.77964205 0.78191984 0.7841916  0.7863526  0.78849983 0.79061
 0.7926786  0.7947364  0.7967787  0.79875404 0.80071664 0.8026626
 0.804

### Min Max Scaler

In [29]:
def examine_df(df: pd.DataFrame) -> None:
    """Print a structural summary and the first five rows of ``df``.

    ``DataFrame.info()`` writes its report to stdout itself and returns
    ``None``. Interpolating the call into an f-string therefore printed the
    report first and then the header followed by the text "None". The header
    is now printed first and ``info()`` is called as a plain statement.

    Args:
        df: The frame to inspect.
    """
    print("DF info: ")
    df.info()
    print(f"Sample of first 5 rows: \n{df.head(5)}")

def dataframe_stats(df: pd.DataFrame) -> None:
    """Print the number of distinct values in the ``uid`` column of ``df``."""
    uid_count = df['uid'].nunique()
    print(f"{uid_count} unique uid")
    

In [15]:
# Read the training features and labels CSVs into DataFrames
train_features_df, train_labels_df = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_FEATURES_PATH, TRAIN_LABELS_PATH)
)

# Print df info and the first 5 rows of each frame, features first
for frame in (train_features_df, train_labels_df):
    examine_df(frame)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   uid           4000 non-null   object
 1   NarrativeLE   4000 non-null   object
 2   NarrativeCME  4000 non-null   object
dtypes: object(3)
memory usage: 93.9+ KB
DF info: 
None
Sample of first 5 rows: 
    uid                                        NarrativeLE  \
0  aaaf  V (XX XX) shot himself in a motor vehicle.The ...   
1  aaby  V was XXXX. V was found in the basement of his...   
2  aacl  V was XXXX. V was found in his residence unres...   
3  aacn  The victim, a XX XX who had recently returned ...   
4  aadb  XX XX V found deceased at home by his grandpar...   

                                        NarrativeCME  
0  V (XX XX) shot himself in a motor vehicle.The ...  
1  V was XXXX.  V was found in the basement of hi...  
2  V was XXXX. V was found in his residence suffe...  
3  On the 

In [30]:
# Unique-uid count for each training frame, features first
for frame in (train_features_df, train_labels_df):
    dataframe_stats(frame)

# Series.equals compares the two uid columns element by element
train_uids_match = train_features_df['uid'].equals(train_labels_df['uid'])
print(f"Train features and labels have all same uids: {train_uids_match}")

4000 unique uid
4000 unique uid
Train features and labels have all same uids: True


In [32]:
# test set: read the smoke-test features and labels CSVs into DataFrames
test_features_df, test_labels_df = (
    pd.read_csv(csv_path) for csv_path in (TEST_FEATURES_PATH, TEST_LABELS_PATH)
)

# Print df info and the first 5 rows of each frame, features first
for frame in (test_features_df, test_labels_df):
    examine_df(frame)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   uid           25 non-null     object
 1   NarrativeLE   25 non-null     object
 2   NarrativeCME  25 non-null     object
dtypes: object(3)
memory usage: 728.0+ bytes
DF info: 
None
Sample of first 5 rows: 
    uid                                        NarrativeLE  \
0  ajrd  V was XXXX. V was found in an oak tree in a wo...   
1  aomw  The V is a XX XX who died as a result of Hangi...   
2  asgb  The XX XX victim had an intentional, self-infl...   
3  aueg  LE summary:V Demographics data is consistent w...   
4  bdji  A XX XX (V) was found unresponsive with a GSW ...   

                                        NarrativeCME  
0  V was XXXX.  V was found inside his residence ...  
1  This XX XX V died as a result of Hanging.  The...  
2  The victim was a XX XX with an intentional, se...  
3  XX YO, 

In [51]:
# Unique-uid count for each test frame
dataframe_stats(test_features_df)
dataframe_stats(test_labels_df)

# This compares the two TEST frames, so the message names the test set.
print(f"Test features and labels have all same uids: {test_features_df['uid'].equals(test_labels_df['uid'])}")

# The overlap check must intersect TRAIN uids with TEST uids. Intersecting the
# test features with the test labels never looks at train and is trivially
# non-empty whenever the two test frames share uids.
shared_uids = set(train_features_df['uid']) & set(test_features_df['uid'])
print(f"Train and test have overlapping uids: {len(shared_uids) != 0}")

25 unique uid
25 unique uid
Train features and labels have all same uids: True
Train and test have overlapping uids: True


In [46]:
# NarrativeLE text of the training row(s) whose uid is 'ajrd'
ajrd_train_mask = train_features_df['uid'] == 'ajrd'
train_features_df[ajrd_train_mask]['NarrativeLE'].values

array(["V was XXXX. V was found in an oak tree in a wooded area on his parent's property deceased from an apparent gunshot wound to the head, which was suicidal in nature. EMS was called and pronounced V dead at the scene. According to V's father, V had been depressed and very upset about his life recently. V had left college and had quit his job. V had been staying with his grandparents prior to moving back with his parents two days prior. V had an argument with his grandfather and told his father that he wanted to move back home. V's father stated that he and V got into it a little bit about V not giving a two week notice at his job and should have thought about the situation before just quitting. V's mother stated that V may have been depressed due to not being able to play baseball at school because of a torn shoulder cuff. V left a suicide note saying that he was sorry that he was too weak to live in this world, that he loved them and that they could find him in his favorite tree.

In [45]:
# NarrativeCME text of the test row(s) whose uid is 'ajrd'
ajrd_test_mask = test_features_df['uid'] == 'ajrd'
test_features_df[ajrd_test_mask]['NarrativeCME'].values

array(["V was XXXX.  V was found inside his residence with a  gunshot wound to the head in an apparent suicide.  No circumstances leading up to the fatal injury were provided in this report.  Per the report V was never satisfied with his accomplishments though did exceedingly well in school and sports.  V abruptly quit his job that was 2 hours from his residence and stated he was coming home to his parents.  The next day V's parents left and V shot himself.  Toxicology results were not provided in this report.  No further information regarding circumstance."],
      dtype=object)

In [48]:
# NarrativeLE text of the training row(s) whose uid is 'aomw'
aomw_train_mask = train_features_df['uid'] == 'aomw'
train_features_df[aomw_train_mask]['NarrativeLE'].values

array(['The V is a XX XX who died as a result of Hanging. The manner of death was Suicide. The V had been dealing with depression and anxiety issues but was not medicated, but was seeing a counselor. The V came home from hanging out with friends and went downstairs. About 20 minutes later, his mother went to say goodnight and found the V hanging from a belt from a pull-up station. Emergency Medical Services responded and pronounced the V. The V had previously struggled with cutting himself, but no current problem was mentioned. The V\'s father reported that he believed the V was "cyber-bullied" into taking his life. The V and his ex-girlfriend had been broken up for a couple of weeks and she reported there were numerous occasions where the V told her he was extremely suicidal. The V had also been struggling with relationship issues with other classmates. Upon interviewing his peers, they reported that the V had threatened them and others and posted inappropriate comments about his ex-g

In [49]:
# NarrativeCME text of the test row(s) whose uid is 'aomw'
aomw_test_mask = test_features_df['uid'] == 'aomw'
test_features_df[aomw_test_mask]['NarrativeCME'].values

array(['This XX XX V died as a result of Hanging.  The manner was Suicide.  The V had been dealing with depression and anxiety issues but was not medicated.  It is not known if he had been diagnosed.  He had also been having problems with his girlfriend and sometime prior to the day of the incident had written on a shelf statements of worthlessness.  He was last known to be alive an hour prior when he returned home and was seen by his mother before he went downstairs to his bedroom.  The mother went down to check on the V and found him unresponsive hanging from exercise equipment by a belt.  Medical personnel responded but were not able to revive the V and pronounced him at the scene.  No notes were found, no other circumstances are known.'],
      dtype=object)