In [4]:
import torch
from transformers import BertTokenizer, BertModel
import logging
import pandas as pd
import numpy as np
import random
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
%matplotlib inline

## Loding data

In [None]:
notes = pd.read_csv('notes.csv')[['id', 'text']]
print(f'shape of the notes file: {notes.shape}')
n_sub = len(pd.unique(notes['id']))
print(f'unique number of subjects: {n_sub}')
notes.head()

shape of the notes file: (81206, 2)
unique number of subjects: 20414


Unnamed: 0,id,text
0,25696644,HISTORY: Altered mental status.\n\nTECHNIQUE:...
1,26048429,"INDICATION: Esophageal carcinoma, status post..."
2,26048429,HISTORY: Postop day one interval change.\n\nC...
3,20214994,EXAMINATION: CHEST (PRE-OP PA AND LAT)\n\nIND...
4,20214994,EXAMINATION: ABDOMINAL RADIOGRAPHS\n\nINDICAT...


In [None]:
# distribution of how many notes each subject has
notes['id'].value_counts().describe()

count    20414.000000
mean         3.977956
std          3.050836
min          1.000000
25%          2.000000
50%          3.000000
75%          5.000000
max         57.000000
Name: count, dtype: float64

In [None]:
notes['text'] = notes['text'].str.replace("___", "")
notes['text'] = notes['text'].str.lower()
notes.head()

Unnamed: 0,id,text
0,25696644,history: altered mental status.\n\ntechnique:...
1,26048429,"indication: esophageal carcinoma, status post..."
2,26048429,history: postop day one interval change.\n\nc...
3,20214994,examination: chest (pre-op pa and lat)\n\nind...
4,20214994,examination: abdominal radiographs\n\nindicat...


### loading essential functions

In [None]:
def token_distribution(df, model_name):
  # load tokenizer
  tokenizer = BertTokenizer.from_pretrained(model_name)
  tokenized = df['text'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

  # investigate token
  rand_id = random.randint(0, len(tokenized)-1)
  print(f'random id: {rand_id}')
  print(df['text'][rand_id])
  print(tokenizer.convert_ids_to_tokens(tokenized[rand_id]))
  print(f'length of the token: {len(tokenized[rand_id])}')


  # visualize token distribution
  max_len = 0
  n = 0
  len_ls = tokenized.apply(lambda x: len(x))
  for i in tokenized.values:
    if len(i) > 512:
      n+=1
    if len(i) > max_len:
        max_len = len(i)
  print(f'maximum lenght is {max_len}')
  print(f'{n} out of {df.shape[0]} ({n/df.shape[0]*100} %) notes exceed the maximum embedding length of 512')

  print(len_ls.describe())
  ax = len_ls.plot.box()
  plt.axhline(y=512, color='r', linestyle='--')
  plt.show()


def obtain_batch_embedding(df, model_name):

  # load tokenizer
  tokenizer = BertTokenizer.from_pretrained(model_name)
  tokenizer.truncation_side='left' # truncate from left to preserve more findings

  # obtain token id and attention mask
  input_ids = []
  attention_masks = []
  for t in df['text']:
    encoded_dict = tokenizer.encode_plus(t,
                                        add_special_tokens=True,
                                        max_length=512,
                                        padding='max_length',
                                        return_attention_mask=True,
                                        return_tensors='pt',
                                        truncation=True)
    input_ids.append(encoded_dict['input_ids'].to(device))
    attention_masks.append(encoded_dict['attention_mask'].to(device))

  # convert to tensor
  input_ids = torch.cat(input_ids, dim=0)
  attention_masks = torch.cat(attention_masks, dim=0)
  print("tokens obtained, start fitting model")

  # fit model
  model = BertModel.from_pretrained(model_name)
  model = model.to(device)
  with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_masks)

  features = pd.DataFrame(last_hidden_states[0][:,0,:].cpu().numpy())
  features.reset_index(drop=True, inplace=True)
  # print(df['id'].shape, features.shape)

  # concatenate the id column
  res = pd.concat([df['id'].reset_index(drop=True), features], axis=1, ignore_index=True)
  print(res.shape)

  # take mean value of the embeddings for each individual
  # res = features.groupby('id').mean()

  return res

def embedding(df, model_name, batch_size=256):

  # loop through batches
  embeddings = pd.DataFrame()
  for i in range(0, df.shape[0], batch_size):
    print(f'batch from {i}th to {i+batch_size}th sample')
    batch = df.iloc[i:i+batch_size]
    batch_embedding = obtain_batch_embedding(batch, model_name)
    embeddings = pd.concat([embeddings, batch_embedding], axis=0)

  return embeddings


In [None]:
notes['id'].reset_index(drop=True).head()

0    25696644
1    26048429
2    26048429
3    20214994
4    20214994
Name: id, dtype: int64

### set up device

In [None]:
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device

'cuda:0'

### generating embedding

In [None]:
# to investigate the distribution of the tokens
token_distribution(notes, "neuml/pubmedbert-base-embeddings")

In [None]:
# obtain embeddings
res = embedding(notes, "neuml/pubmedbert-base-embeddings")
print(res.shape)
res.head()

batch from 0th to 256th sample
tokens obtained, start fitting model
(256, 769)
batch from 256th to 512th sample
tokens obtained, start fitting model
(256, 769)
batch from 512th to 768th sample
tokens obtained, start fitting model
(256, 769)
batch from 768th to 1024th sample
tokens obtained, start fitting model
(256, 769)
batch from 1024th to 1280th sample
tokens obtained, start fitting model
(256, 769)
batch from 1280th to 1536th sample
tokens obtained, start fitting model
(256, 769)
batch from 1536th to 1792th sample
tokens obtained, start fitting model
(256, 769)
batch from 1792th to 2048th sample
tokens obtained, start fitting model
(256, 769)
batch from 2048th to 2304th sample
tokens obtained, start fitting model
(256, 769)
batch from 2304th to 2560th sample
tokens obtained, start fitting model
(256, 769)
batch from 2560th to 2816th sample
tokens obtained, start fitting model
(256, 769)
batch from 2816th to 3072th sample
tokens obtained, start fitting model
(256, 769)
batch from 30

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,759,760,761,762,763,764,765,766,767,768
0,25696644,-0.961882,-0.149763,-0.030739,-1.021122,0.191997,-0.305727,-0.699893,-0.370058,-0.260747,...,-0.09094,0.739634,-0.700481,0.436043,1.198344,-0.129276,0.011377,0.181033,0.657025,-0.71831
1,26048429,-0.442894,-0.423292,-0.743249,-1.27186,0.189307,0.494722,-0.862139,-0.221709,-0.21498,...,-0.699226,0.001578,0.051244,-0.16088,0.49371,0.014612,0.500069,0.564776,0.546543,-0.730617
2,26048429,-0.424305,-0.159977,-0.158158,-0.804049,-0.107067,0.969735,-1.0707,-0.02891,-0.361005,...,-0.067264,0.025152,-0.046377,0.057703,0.960852,-0.09118,0.868535,-0.034059,1.105601,-0.274495
3,20214994,-0.16713,-0.38777,-0.255653,-0.966652,-0.246591,0.796385,-0.892034,0.365918,-0.349409,...,-0.563831,0.112659,-0.470008,0.359844,0.903242,-0.311563,0.60562,-0.036537,0.284726,-0.450746
4,20214994,-0.990296,0.236162,-0.891731,-0.060764,-0.043092,-0.045188,-1.281586,0.226228,0.261259,...,-0.357992,0.056466,-0.378226,0.651119,0.847899,0.221708,0.307551,-0.120801,0.4975,-0.653763


In [None]:
res.to_csv('pubmedbert_embedding.csv', index=False)

## Embedding aggregation
1. average
2. max pooling
3. randomly select one

In [None]:
embedding = pd.read_csv("./pubmedbert_embedding.csv")
embedding.rename(columns={'0': 'id'}, inplace=True)
print(embedding.shape)
embedding.head()

(81206, 769)


Unnamed: 0,id,1,2,3,4,5,6,7,8,9,...,759,760,761,762,763,764,765,766,767,768
0,25696644,-0.961882,-0.149763,-0.030739,-1.021122,0.191997,-0.305727,-0.699893,-0.370058,-0.260747,...,-0.09094,0.739634,-0.700481,0.436043,1.198344,-0.129276,0.011377,0.181033,0.657025,-0.71831
1,26048429,-0.442894,-0.423292,-0.743249,-1.27186,0.189307,0.494722,-0.862139,-0.221709,-0.21498,...,-0.699226,0.001578,0.051244,-0.16088,0.49371,0.014612,0.500069,0.564776,0.546543,-0.730617
2,26048429,-0.424305,-0.159977,-0.158158,-0.804049,-0.107067,0.969735,-1.0707,-0.02891,-0.361005,...,-0.067264,0.025152,-0.046377,0.057703,0.960852,-0.09118,0.868535,-0.034059,1.105601,-0.274495
3,20214994,-0.16713,-0.38777,-0.255653,-0.966652,-0.246591,0.796385,-0.892034,0.365918,-0.349409,...,-0.563831,0.112659,-0.470008,0.359844,0.903242,-0.311563,0.60562,-0.036537,0.284726,-0.450746
4,20214994,-0.990296,0.236162,-0.891731,-0.060764,-0.043092,-0.045188,-1.281586,0.226228,0.261259,...,-0.357992,0.056466,-0.378226,0.651119,0.847899,0.221708,0.307551,-0.120801,0.4975,-0.653763


In [None]:
test = embedding[20000:22000]
test.rename(columns={'0': 'id'}, inplace=True)
test.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test.rename(columns={'0': 'id'}, inplace=True)


Unnamed: 0,id,1,2,3,4,5,6,7,8,9,...,759,760,761,762,763,764,765,766,767,768
20000,26870909,-0.538786,-0.033661,0.628227,-0.562071,-0.133268,-0.058549,-0.707114,0.423699,-0.77543,...,-0.355354,-0.520007,-0.145064,0.74914,0.420206,-0.729575,0.048278,-0.297783,0.272952,-0.711024
20001,27727321,-0.490337,0.126018,-0.787498,-0.433758,-0.107921,1.119397,-1.464724,-0.043529,-0.234906,...,-0.618939,-0.069264,-0.147243,0.215716,0.701113,-0.217276,0.324226,-0.045105,0.367587,-0.679906
20002,27727321,-0.776378,0.614439,0.0766,-0.12149,0.141488,-0.167427,-0.82751,0.726984,0.006872,...,-0.208094,0.37285,-0.380829,1.214288,0.292794,-0.963728,0.137193,-0.589385,0.067548,-1.59959
20003,27727321,-0.296081,0.250667,-0.121046,-0.068658,0.268956,0.670307,-0.822947,0.570372,0.135268,...,-0.544799,-0.744366,-0.346079,0.679209,0.245669,-0.741133,0.526469,0.254001,-0.079056,-0.542372
20004,29510747,-0.234447,-0.17607,-0.09211,-0.787402,0.35586,0.713764,-1.3758,-0.408523,-0.443587,...,-0.744142,-0.172968,-0.075356,-0.088719,1.486721,-0.237053,0.435958,0.07569,0.888237,0.037683


In [None]:
pd.DataFrame(test[test['id']==20214994].iloc[1]).T

Unnamed: 0,id,1,2,3,4,5,6,7,8,9,...,759,760,761,762,763,764,765,766,767,768
4,20214994.0,-0.990296,0.236162,-0.891731,-0.060764,-0.043092,-0.045188,-1.281586,0.226228,0.261259,...,-0.357992,0.056466,-0.378226,0.651119,0.847899,0.221708,0.307551,-0.120801,0.4975,-0.653763


In [None]:
pd.unique(test['id'])

array([25696644, 26048429, 20214994, 23559586, 24181354, 28128182,
       29988601, 22869003, 27939719, 23920883, 26488315, 25777141,
       22081550, 22987108, 22218665, 20291550, 23676183, 26359957,
       29842315, 22429197, 22216667])

In [None]:
def avg(df):
    res = df.groupby('id').mean().reset_index()
    return res

def max_pooling(df):
    res = df.groupby('id').max().reset_index()
    return res

def return_random_note(rows, rand_n):
    # rows = df[df['id']==id]
    # print(rows.shape, rand_n)
    res = pd.DataFrame(rows.iloc[rand_n]).T
    return res

def random_sample(df):
    # obtain count of notes for each id
    N = df.groupby('id').size().to_frame().reset_index()
    N.rename(columns={0: 'count'}, inplace=True)

    # generate a random number for each id
    N['rand'] = N.apply(lambda x: np.random.randint(0, x['count']), axis=1)

    # obtain random note for each id
    res = pd.DataFrame()
    count = 0
    id_ls = pd.unique(df['id'])
    res_ls = []
    for id in id_ls:
        rand_n = N[N['id']==id]['rand'].iloc[0]
        rows = df[df['id']==id]
        selected = return_random_note(rows, rand_n)
        # res = pd.concat([res, selected], ignore_index=True)
        res_ls.append(selected)

        count += 1
        if count % 500 == 0:
            print(f"{count} unique ids finished")

    res = pd.concat(res_ls)
    return res

In [None]:
pubmed_avg = avg(embedding)
print(pubmed_avg.shape)
# pubmed_avg.to_csv("pubmed_avg.csv", index=False)#

(20414, 769)


In [None]:
pubmed_max = max_pooling(embedding)
print(pubmed_max.shape)
# pubmed_max.to_csv("pubmed_max.csv", index=False)

(20414, 769)


In [None]:
del pubmed_avg
del pubmed_max

In [None]:
pd.unique(embedding['id']).shape

(20414,)

In [None]:
embedding.columns

Index(['id', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ...
       '759', '760', '761', '762', '763', '764', '765', '766', '767', '768'],
      dtype='object', length=769)

In [None]:
pubmed_rand = random_sample(embedding)
print(pubmed_rand.shape)
pubmed_rand.head()
# pubmed_rand.to_csv("pubmed_rand.csv", index=False)

500 unique ids finished
1000 unique ids finished
1500 unique ids finished
2000 unique ids finished
2500 unique ids finished
3000 unique ids finished
3500 unique ids finished
4000 unique ids finished
4500 unique ids finished
5000 unique ids finished
5500 unique ids finished
6000 unique ids finished
6500 unique ids finished
7000 unique ids finished
7500 unique ids finished
8000 unique ids finished
8500 unique ids finished
9000 unique ids finished
9500 unique ids finished
10000 unique ids finished
10500 unique ids finished
11000 unique ids finished
11500 unique ids finished
12000 unique ids finished
12500 unique ids finished
13000 unique ids finished
13500 unique ids finished
14000 unique ids finished
14500 unique ids finished
15000 unique ids finished
15500 unique ids finished
16000 unique ids finished
16500 unique ids finished
17000 unique ids finished
17500 unique ids finished
18000 unique ids finished
18500 unique ids finished
19000 unique ids finished
19500 unique ids finished
20000 

Unnamed: 0,id,1,2,3,4,5,6,7,8,9,...,759,760,761,762,763,764,765,766,767,768
0,25696644.0,-0.961882,-0.149763,-0.030739,-1.021122,0.191997,-0.305727,-0.699893,-0.370058,-0.260747,...,-0.09094,0.739634,-0.700481,0.436043,1.198344,-0.129276,0.011377,0.181033,0.657025,-0.71831
2,26048429.0,-0.424305,-0.159977,-0.158158,-0.804049,-0.107067,0.969735,-1.0707,-0.02891,-0.361005,...,-0.067264,0.025152,-0.046377,0.057703,0.960852,-0.09118,0.868535,-0.034059,1.105601,-0.274495
5,20214994.0,-0.447072,-0.479759,-0.389428,-0.755359,0.238575,1.226583,-1.005992,-0.461748,0.192283,...,-0.264748,0.254763,-0.15644,-0.025247,1.146273,-0.28628,0.136631,-0.222716,0.371108,-0.07618
10,23559586.0,-0.736017,0.23011,1.104697,-0.541268,-0.361138,-0.35682,-1.131791,0.329295,-0.190179,...,0.157699,-0.016856,-0.400899,0.286962,0.302196,-0.44993,0.583222,-0.040345,0.822514,-0.462158
17,24181354.0,-0.927706,0.178118,0.555045,-1.186871,-0.107297,-0.137812,-0.570643,-0.49785,-0.53474,...,-0.21095,0.260646,-0.443489,0.493528,0.777205,-0.347572,0.594686,-0.174461,0.775348,-0.33019


In [None]:
pubmed_rand.to_csv("pubmed_rand.csv", index=False)

## PCA for dimensionality reduction

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [15]:
# Read CSV files
outlier_id = pd.read_csv('/content/drive/MyDrive/outlier_id.csv')
pubmed_avg = pd.read_csv('/content/drive/MyDrive/pubmed_avg.csv')
pubmed_max = pd.read_csv('/content/drive/MyDrive/pubmed_max.csv')
pubmed_rand = pd.read_csv('/content/drive/MyDrive/pubmed_rand.csv')

# Remove outliers IDs in pubmed_avg
pubmed_avg = pubmed_avg[~pubmed_avg['id'].isin(outlier_id['id'])]
pubmed_max = pubmed_max[~pubmed_max['id'].isin(outlier_id['id'])]
pubmed_rand = pubmed_rand[~pubmed_rand['id'].isin(outlier_id['id'])]

# Save the modified pubmed_avg DataFrame to a new CSV file
pubmed_avg.to_csv('pubmed_avg_filtered.csv', index=False)
pubmed_max.to_csv('pubmed_max_filtered.csv', index=False)
pubmed_rand.to_csv('pubmed_rand_filtered.csv', index=False)

In [34]:
def pca(df, percent):
    pca = PCA(n_components=percent, svd_solver = 'full')
    pca.fit(df)
    df = pca.transform(df)
    print(df.shape)
    return df

In [29]:
# PCA - average
df = pd.read_csv('pubmed_avg_filtered.csv')
print(df.shape)
df.head()

(19199, 769)


Unnamed: 0,id,1,2,3,4,5,6,7,8,9,...,759,760,761,762,763,764,765,766,767,768
0,20001305,-0.075535,-0.468668,-0.31398,-0.526805,0.38238,0.688998,-0.675681,-0.031537,-0.162461,...,-0.406018,-0.267598,-0.352351,-0.016289,0.608691,-0.279628,0.498361,0.398085,0.060995,-0.490661
1,20001361,-0.584708,-0.174262,-0.04378,-0.659381,0.050867,0.312736,-0.821914,-0.263517,-0.015974,...,-0.335955,0.123488,-0.568291,0.401683,0.943098,-0.13242,0.237798,-0.110977,0.533044,-0.634867
2,20001770,-0.678607,-0.173897,-0.080683,-0.491075,0.278513,0.427995,-1.064977,0.163601,0.123127,...,-0.549967,-0.229505,-0.299411,0.358142,1.111076,-0.258609,0.544541,-0.278566,0.412698,-0.58723
3,20002506,-0.586007,0.360953,0.091846,-0.729991,-0.123258,-0.152887,-0.51921,0.068348,0.128391,...,0.079765,0.256722,-0.582994,0.641537,0.701096,-0.686125,0.654941,0.018351,0.120024,-0.907561
4,20003425,-0.600642,-0.369953,-0.518536,-0.95912,-0.416439,0.34315,-0.855857,-0.213322,0.098074,...,-0.116267,0.183969,-0.133363,0.307279,0.665704,0.044973,0.604583,0.22656,0.613534,-0.298996


In [36]:
pubmed_avg_pca = pca(df.iloc[:, 1:], 0.90)
pubmed_avg_pca.shape

(19199, 83)


(19199, 83)

In [37]:
df.iloc[:, 1:10].describe()

Unnamed: 0,1,2,3,4,5,6,7,8,9
count,19199.0,19199.0,19199.0,19199.0,19199.0,19199.0,19199.0,19199.0,19199.0
mean,-0.540947,-0.094074,-0.153426,-0.712511,-0.087776,0.499041,-0.951525,0.053724,-0.135272
std,0.220388,0.247458,0.283627,0.19533,0.197819,0.323319,0.231913,0.284209,0.233134
min,-1.709716,-1.127518,-1.642194,-1.659191,-1.134177,-1.069132,-2.095284,-1.202635,-1.331885
25%,-0.680206,-0.248961,-0.320883,-0.841744,-0.215987,0.300507,-1.098054,-0.134865,-0.282983
50%,-0.548353,-0.093743,-0.126838,-0.718714,-0.083815,0.514568,-0.964454,0.062533,-0.137748
75%,-0.40822,0.058763,0.040622,-0.589291,0.044428,0.721662,-0.822076,0.244435,0.008196
max,0.570453,1.25239,0.892609,0.412416,0.728293,1.56265,0.605821,1.44735,1.080881


In [38]:
# Export the DataFrame to a CSV file
pubmed_avg_pca = pd.DataFrame(pubmed_avg_pca)
pubmed_avg_pca.to_csv('/content/drive/MyDrive/pubmed_avg_pca.csv', index=False)

In [39]:
# PCA - max
df = pd.read_csv('pubmed_max_filtered.csv')
print(df.shape)
df.head()

(19199, 769)


Unnamed: 0,id,1,2,3,4,5,6,7,8,9,...,759,760,761,762,763,764,765,766,767,768
0,20001305,0.116131,0.101565,-0.154592,-0.210217,0.457556,0.714431,-0.664123,0.273169,-0.06018,...,-0.350302,0.17258,-0.148385,0.505068,1.131303,0.179012,0.592597,0.536746,0.436979,-0.421962
1,20001361,-0.250039,0.444062,0.716169,-0.279527,0.216697,0.852049,-0.1231,0.29668,0.683901,...,0.067592,0.277175,-0.277855,0.790381,1.2696,0.561676,0.556831,0.106106,1.179237,-0.035907
2,20001770,-0.318809,0.005525,0.370038,-0.170966,0.403131,0.699412,-0.799022,0.60745,0.744418,...,-0.165406,0.160972,-0.109873,0.753838,1.531666,0.045621,0.80461,-0.079868,0.630716,-0.232106
3,20002506,-0.310161,0.444259,0.559119,-0.400462,0.230889,0.022556,-0.487383,0.438878,0.279286,...,0.224865,0.465624,-0.22075,0.929504,0.796072,-0.43759,1.063125,0.404077,0.214056,-0.669795
4,20003425,-0.148293,-0.194348,-0.262155,-0.670062,-0.079511,0.80691,-0.76067,-0.150775,0.252627,...,0.252866,0.413969,-0.003944,0.481344,1.009188,0.341325,1.038453,0.510099,1.133817,-0.086359


In [43]:
pubmed_max_pca = pca(df.iloc[:, 1:], 0.90)
pubmed_max_pca.shape

(19199, 245)


In [14]:
df.iloc[:, 1:10].describe()

Unnamed: 0,1,2,3,4,5,6,7,8,9
count,19199.0,19199.0,19199.0,19199.0,19199.0,19199.0,19199.0,19199.0,19199.0
mean,-0.279342,0.168207,0.125015,-0.481349,0.127789,0.850275,-0.667917,0.359554,0.140609
std,0.283089,0.342288,0.373056,0.294367,0.246901,0.318469,0.358482,0.374164,0.332743
min,-1.709716,-1.06571,-1.642194,-1.659191,-1.134177,-1.069132,-2.095284,-1.202635,-1.331885
25%,-0.453882,-0.059275,-0.105793,-0.69185,-0.028663,0.719191,-0.914006,0.112664,-0.07646
50%,-0.270913,0.177764,0.15966,-0.501217,0.131579,0.904415,-0.68686,0.373679,0.124519
75%,-0.093046,0.401712,0.382289,-0.279355,0.289934,1.054633,-0.451541,0.614859,0.348025
max,1.161791,1.688668,1.41769,0.662017,1.066363,1.736414,0.846251,1.782243,1.79417


In [44]:
# Export the DataFrame to a CSV file
pubmed_max_pca = pd.DataFrame(pubmed_max_pca)
pubmed_max_pca.to_csv('/content/drive/MyDrive/pubmed_max_pca.csv', index=False)

In [45]:
# PCA - random
df = pd.read_csv('pubmed_rand_filtered.csv')
print(df.shape)
df.head()

(19199, 769)


Unnamed: 0,id,1,2,3,4,5,6,7,8,9,...,759,760,761,762,763,764,765,766,767,768
0,25696644.0,-0.961882,-0.149763,-0.030739,-1.021122,0.191997,-0.305727,-0.699893,-0.370058,-0.260747,...,-0.09094,0.739634,-0.700481,0.436043,1.198344,-0.129276,0.011377,0.181033,0.657025,-0.71831
1,26048429.0,-0.424305,-0.159977,-0.158158,-0.804049,-0.107067,0.969735,-1.0707,-0.02891,-0.361005,...,-0.067264,0.025152,-0.046377,0.057703,0.960852,-0.09118,0.868535,-0.034059,1.105601,-0.274495
2,20214994.0,-0.447072,-0.479759,-0.389428,-0.755359,0.238575,1.226583,-1.005992,-0.461748,0.192283,...,-0.264748,0.254763,-0.15644,-0.025247,1.146273,-0.28628,0.136631,-0.222716,0.371108,-0.07618
3,23559586.0,-0.736017,0.23011,1.104697,-0.541268,-0.361138,-0.35682,-1.131791,0.329295,-0.190179,...,0.157699,-0.016856,-0.400899,0.286962,0.302196,-0.44993,0.583222,-0.040345,0.822514,-0.462158
4,24181354.0,-0.927706,0.178118,0.555045,-1.186871,-0.107297,-0.137812,-0.570643,-0.49785,-0.53474,...,-0.21095,0.260646,-0.443489,0.493528,0.777205,-0.347572,0.594686,-0.174461,0.775348,-0.33019


In [46]:
pubmed_rand_pca = pca(df.iloc[:, 1:], 0.90)
pubmed_rand_pca.shape

(19199, 94)


(19199, 94)

In [47]:
df.iloc[:, 1:10].describe()

Unnamed: 0,1,2,3,4,5,6,7,8,9
count,19199.0,19199.0,19199.0,19199.0,19199.0,19199.0,19199.0,19199.0,19199.0
mean,-0.538486,-0.09242,-0.153788,-0.713754,-0.088627,0.497843,-0.950703,0.05292,-0.133751
std,0.331951,0.346553,0.385997,0.28541,0.283617,0.47495,0.347335,0.403165,0.339457
min,-1.709716,-1.515076,-1.642194,-1.753833,-1.40555,-1.225964,-2.163619,-1.298454,-1.541923
25%,-0.756607,-0.335489,-0.413632,-0.90722,-0.270351,0.138293,-1.18348,-0.23008,-0.356437
50%,-0.527452,-0.104973,-0.144998,-0.734944,-0.075192,0.612,-0.985972,0.05524,-0.141439
75%,-0.310704,0.137783,0.12135,-0.544185,0.10195,0.867688,-0.749105,0.327957,0.078165
max,0.669352,1.671811,1.300701,0.555422,0.999204,1.736414,0.831605,1.77461,1.373985


In [48]:
# Export the DataFrame to a CSV file
pubmed_rand_pca = pd.DataFrame(pubmed_rand_pca)
pubmed_rand_pca.to_csv('/content/drive/MyDrive/pubmed_rand_pca.csv', index=False)