In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split



1. truncate item 7 to avoid common patterns in the text

In [2]:
# Load the feature/label table and continue on a copy of it.
# NOTE(review): unpickling can execute arbitrary code; only load this file
# from a trusted source.
df = pd.read_pickle('datasets/features-label.pkl')
df_copy = df.copy()
# Show (rows, columns) as the cell output.
df_copy.shape

(43290, 9)

In [3]:
# Preview the first rows: identifiers, financial ratios and the tokenised text.
df_copy.head()

Unnamed: 0,CIK,file_id,year,roe,opinc,nopinc,preprocessed_item_7,token_count,roe_next_year
0,1750,3,2005,0.095362,0.131652,-0.015174,"[general, overview, report, activities, four, ...",2320,0.127945
1,1750,4,2006,0.127945,0.149182,-0.022282,"[forwardlooking, statements, managements, disc...",3419,0.13922
2,1750,7,2009,0.063607,0.105913,-0.030043,"[forwardlooking, statements, managements, disc...",3564,0.088296
3,1750,8,2010,0.088296,0.136418,-0.040048,"[forwardlooking, statements, managements, disc...",3245,0.079613
4,1750,9,2011,0.079613,0.138888,-0.026679,"[forwardlooking, statements, managements, disc...",2703,0.061607


In [4]:
# Inspect the column dtypes (the recorded output shows CIK and file_id stored as object).
df_copy.dtypes

CIK                     object
file_id                 object
year                     int64
roe                    float64
opinc                  float64
nopinc                 float64
preprocessed_item_7     object
token_count              int64
roe_next_year          float64
dtype: object

In [4]:
# Strip the opening and closing words of a document:
#   - fewer than 1000 tokens  -> drop 50 words at each end
#   - 1000 tokens or more     -> drop 100 words at each end
# and report the new token count.
def remove_first_last_words(row):
    """Trim both ends of a row's token list.

    Parameters
    ----------
    row : mapping
        Must provide 'preprocessed_item_7' (sequence of tokens) and
        'token_count' (used to pick the trim width).
        NOTE(review): the width is chosen from the stored 'token_count',
        not from the actual sequence length — presumably they agree.

    Returns
    -------
    tuple
        (trimmed tokens, number of trimmed tokens). Documents shorter than
        twice the trim width come back empty.
    """
    tokens = row['preprocessed_item_7']
    trim = 50 if row['token_count'] < 1000 else 100
    trimmed = tokens[trim:-trim]
    return trimmed, len(trimmed)

In [5]:
# Trim every document and refresh its token count in one pass.
# The function is applied to the untouched `df` (not to `df_copy`), so
# re-running this cell always gives the same result instead of trimming the
# already-trimmed text a second time.
df_copy[['preprocessed_item_7', 'token_count']] = df.apply(
    remove_first_last_words, axis=1, result_type='expand'
)

2. split dataset and save train_ids and test_ids

In [6]:
# Split the filings of every company (CIK) separately, so that each company
# with at least two filings appears in both the training and the test set.
# The target ratio is 80% / 20% per company; because the test count is
# rounded up inside each group, the overall test share ends up above 20%.
companies = df_copy[['CIK', 'file_id']].groupby('CIK')

random_seed = 42

# Per-company pieces are collected in their own lists; `train_ids` and
# `test_ids` are only ever bound to the final DataFrames.
train_parts = []
test_parts = []

for _, company in companies:
    if len(company) < 2:
        # A single filing cannot be divided: train_test_split would raise a
        # ValueError (empty training set), so the filing is kept for training.
        train_parts.append(company)
        continue
    train_group, test_group = train_test_split(
        company, test_size=0.2, random_state=random_seed
    )
    train_parts.append(train_group)
    test_parts.append(test_group)

train_ids = pd.concat(train_parts)[['file_id']]
test_ids = pd.concat(test_parts)[['file_id']]

# Every filing must land in exactly one of the two sets.
assert len(train_ids) + len(test_ids) == len(df_copy)

In [7]:
# Number of filings assigned to the training set (single `file_id` column).
train_ids.shape

(32113, 1)

In [8]:
# Number of filings assigned to the test set (single `file_id` column).
test_ids.shape

(11177, 1)

In [9]:
# Persist the file_id lists of both splits; index=False keeps the row index
# out of the files, so each CSV holds only the `file_id` column.
train_ids.to_csv('train-test-ids/train-ids.csv', index=False)
test_ids.to_csv('train-test-ids/test-ids.csv', index=False)

3. represent item 7 by tone based on LMD

In [None]:
# Separate frame for the tone feature, so `df_copy` stays free of the added column.
df_m1_1 = df_copy.copy()

In [None]:
# calculate tone based on LMD
lmd_neg = pd.read_csv('../LMD/LMD-neg-words.csv')
lmd_pos = pd.read_csv('../LMD/LMD-pos-words.csv')


def _lexicon_words(lexicon):
    """Collect every string cell of a word-list frame as a lower-cased set."""
    return {
        cell.strip().lower()
        for cell in lexicon.to_numpy().ravel()
        if isinstance(cell, str)
    }


# `word in <DataFrame>` only tests the column labels, never the cell values,
# so the words are extracted into sets for the membership tests below.
# NOTE(review): assumes each CSV has a header row followed by the words, and
# that the tokens are lower-case (as in the preview of `preprocessed_item_7`)
# — confirm against the files.
lmd_neg_words = _lexicon_words(lmd_neg)
lmd_pos_words = _lexicon_words(lmd_pos)


def calculate_item7_tone(row):
    """Return the net tone of one document.

    tone = (positive word count - negative word count) / total word count

    Parameters
    ----------
    row : mapping
        Must provide 'preprocessed_item_7', a sequence of tokens.

    Returns
    -------
    float
        The tone score; 0.0 (neutral) for an empty document, which would
        otherwise cause a division by zero.
    """
    text = row['preprocessed_item_7']
    if len(text) == 0:
        return 0.0

    neg_count = sum(1 for word in text if word in lmd_neg_words)
    pos_count = sum(1 for word in text if word in lmd_pos_words)

    return (pos_count - neg_count) / len(text)

In [None]:
# Add one tone score per filing by applying the function row by row.
df_m1_1['tone'] = df_m1_1.apply(calculate_item7_tone, axis=1)

4. represent item 7 by word2vec

In [None]:
# Only filings whose file_id is in the training split feed the locally
# trained word-embedding model.
# NOTE(review): this assumes file_id is unique across companies; otherwise a
# test filing sharing an id with a training filing is selected too — confirm.
train_data = df_copy.loc[df_copy['file_id'].isin(train_ids['file_id'])]
train_data_text = list(train_data['preprocessed_item_7'])

In [None]:
# train word2vec model, using continuous bag-of-words algorithm
# All hyper-parameters are left at the library defaults.
# NOTE(review): the "100" in the name presumably refers to the default vector
# size — confirm for the installed gensim version.
word_embedding_100 = Word2Vec(train_data_text)

In [None]:
# save the CBOW word embedding model to disk
word_embedding_100.save('word-embedding/word-embedding-100.model')

In [None]:
# train word2vec model, using skip-gram algorithm (sg=1)
# Apart from `sg`, the hyper-parameters are left at the library defaults.
word_embedding_100_sg = Word2Vec(train_data_text, sg=1)

In [None]:
# save the skip-gram word embedding model to disk
word_embedding_100_sg.save('word-embedding/word-embedding-100-sg.model')

In [None]:
# Turn a tokenised document into one fixed-length vector with a trained
# embedding model (used for the entire dataset).
def get_text_vector(text, model):
    """Average the embedding vectors of the words in `text`.

    Parameters
    ----------
    text : iterable of str
        Tokens of one document.
    model : object
        Trained embedding model exposing `wv` (word -> vector lookup with
        membership test) and `vector_size`.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of all in-vocabulary tokens, or a
        zero vector of length `model.vector_size` when no token is known.
    """
    known_vectors = []
    for token in text:
        # Tokens missing from the vocabulary are skipped.
        if token in model.wv:
            known_vectors.append(model.wv[token])

    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

In [None]:
# Continue on a copy of the tone frame; the text vectors are added to this copy.
df_m1_2 = df_m1_1.copy()

In [None]:
# Represent every document (train and test) by its mean word vector.
# Only the CBOW model `word_embedding_100` is used here; the skip-gram model
# is trained and saved but not applied in this cell.
df_m1_2['text_vector'] = df_m1_2['preprocessed_item_7'].apply(lambda x: get_text_vector(x, word_embedding_100))

In [None]:
# save features and label with text represented
# (the frame now also carries the `tone` and `text_vector` columns)
df_m1_2.to_pickle('datasets/features-label-text-represented.pkl')