In [1]:
import pandas as pd
import numpy as np
import pickle
import time
import math
import seaborn as sns
import matplotlib.pyplot as plt
import glob
import re
import csv

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix
import os

In [2]:
pd.set_option('display.max_columns', 50)
# change this directory to a folder with all the files you want in it 
directory_path = '/home/jhj5dh/ARI/dspg22ari2/BERT_Analysis/GAT'

# list of file names in the directory
list_of_files = os.listdir(directory_path)
sorted_files = sorted(list_of_files)
#print(sorted_files)

# initialize an empty dataframe to store the text documents
df = pd.DataFrame(columns=['text'])

for filename in sorted_files:
    file_path = os.path.join(directory_path, filename)
    # check if the path is a file
    if os.path.isfile(file_path):
        # read the contents of the file and append as a new row to the dataframe
        print(filename)
        text = pd.read_csv(file_path, error_bad_lines=False, encoding='utf-8', quoting=csv.QUOTE_NONE, lineterminator='.', header=None)[0].str.cat().strip()
        df = pd.concat([df, pd.DataFrame({'text': [text]})], ignore_index=True)



ADP_6.22_07_2019.txt
GAT_bad_performer_adaptability.txt
GAT_bad_performer_bad_coping.txt
GAT_bad_performer_catastrophizing.txt
GAT_bad_performer_character.txt
GAT_bad_performer_depression.txt
GAT_bad_performer_family_closeness.txt
GAT_bad_performer_family_satisfaction.txt
GAT_bad_performer_family_support.txt
GAT_bad_performer_friendship.txt
GAT_bad_performer_good_coping.txt
GAT_bad_performer_life_meaning.txt
GAT_bad_performer_loneliness.txt
GAT_bad_performer_negative_affect.txt
GAT_bad_performer_non-work_interests.txt
GAT_bad_performer_optimism.txt
GAT_bad_performer_organizational_trust.txt
GAT_bad_performer_positive_affect.txt
GAT_bad_performer_work_engagement.txt
GAT_good_performer_adaptability.txt
GAT_good_performer_bad_coping.txt
GAT_good_performer_catastrophizing.txt
GAT_good_performer_character.txt
GAT_good_performer_depression.txt
GAT_good_performer_family_closeness.txt
GAT_good_performer_family_satisfaction.txt
GAT_good_performer_family_support.txt
GAT_good_performer_friendship

b'Skipping line 2: expected 1 fields, saw 3\nSkipping line 3: expected 1 fields, saw 2\nSkipping line 10: expected 1 fields, saw 3\nSkipping line 1000: expected 1 fields, saw 3\nSkipping line 3036: expected 1 fields, saw 4\nSkipping line 8094: expected 1 fields, saw 7\nSkipping line 8095: expected 1 fields, saw 2\nSkipping line 8097: expected 1 fields, saw 3\nSkipping line 8098: expected 1 fields, saw 5\nSkipping line 8103: expected 1 fields, saw 3\nSkipping line 8104: expected 1 fields, saw 2\nSkipping line 8108: expected 1 fields, saw 4\nSkipping line 8110: expected 1 fields, saw 3\nSkipping line 8111: expected 1 fields, saw 6\nSkipping line 8118: expected 1 fields, saw 2\nSkipping line 8122: expected 1 fields, saw 2\nSkipping line 8123: expected 1 fields, saw 2\nSkipping line 8124: expected 1 fields, saw 2\nSkipping line 8127: expected 1 fields, saw 2\nSkipping line 8130: expected 1 fields, saw 4\nSkipping line 8131: expected 1 fields, saw 4\nSkipping line 8133: expected 1 fields, s

In [3]:
# this line is taking out the new lines so it doesn't display them all as separate documents
df = df.replace(r'\n',' ', regex=True)
df = df.replace('', ' ', regex=True)

import unicodedata
df['text'] = df['text'].apply(lambda x: ''.join([' ' if not unicodedata.normalize('NFKD', char).encode('ASCII', 'ignore') else char for char in x]))


In [4]:
model_name = 'gpt2'
gp2_model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Add padding token
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

def encode_document(text, max_tokens=5):
    input_ids = tokenizer.encode(text, max_length=max_tokens, truncation=True, padding='max_length', return_tensors="pt")
    vector = gp2_model.transformer.wte.weight[input_ids,:]
    document_embedding = torch.flatten(vector, start_dim=0, end_dim=1)
    document_embedding = torch.flatten(document_embedding, start_dim=0, end_dim=1).detach().numpy()
    return document_embedding

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
#https://github.com/huggingface/transformers/issues/1458

vectors = df['text'].apply(lambda x: encode_document(str(x)))
output_vectors = pd.DataFrame(vectors.tolist())
output_vectors = output_vectors.iloc[:, : 768]

# Write the output vectors to a csv file
# change the output vector name every time you change the folder
output_vectors.to_csv('output_vectors_GAT_GP2.csv', index=False)
