# Replication of Lazy Prices

## 1. Basic prep: measuring the document similarity

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import jaccard_score
import nltk
from nltk.metrics.distance import edit_distance
from nltk.tokenize import TreebankWordTokenizer

In [2]:
# Three toy sentences: the second adds one word to the first; the third shares
# only "We expect" with the others.
documents = ["We expect demand to increase.",
             "We expect worldwide demand to increase.",
             "We expect weakness in sales."]

# Bag-of-words term counts: one row per sentence, one column per vocabulary term.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(documents)

# Show the count matrix with the vocabulary terms as column labels.
pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

Unnamed: 0,demand,expect,in,increase,sales,to,we,weakness,worldwide
0,1,1,0,1,0,1,1,0,0
1,1,1,0,1,0,1,1,0,1
2,0,1,1,0,1,0,1,1,0


In [3]:
# Each X[i] is passed as a one-row matrix, so the result comes back as a
# 1x1 array (hence the nested brackets in the printed value).
similarity = cosine_similarity(X[0], X[1])
print("Cosine similarity between the first two documents:", similarity)

Cosine similarity between the first two documents: [[0.91287093]]


In [4]:
# Cosine similarity of the count rows for documents 0 and 2.
print("Cosine similarity between the first and third documents:", cosine_similarity(X[0], X[2]))

Cosine similarity between the first and third documents: [[0.4]]


In [5]:
# Jaccard compares term presence, so reduce every positive count to a 1 flag.
X_binary = (X.toarray() > 0).astype(int)

# Document 0 is the reference row for both comparisons.
first_doc = X_binary[0]
second_doc = X_binary[1]
third_doc = X_binary[2]

print("Jaccard similarity between the first two documents:", jaccard_score(first_doc, second_doc))
print("Jaccard similarity between the first and third documents:", jaccard_score(first_doc, third_doc))

Jaccard similarity between the first two documents: 0.8333333333333334
Jaccard similarity between the first and third documents: 0.25


In [6]:
# Densify once; rows 0 and 1 are the two documents being compared.
dense_counts = X.toarray()
doc1_vector = dense_counts[0]
doc2_vector = dense_counts[1]

features = vectorizer.get_feature_names_out()

# Repeat each vocabulary term as many times as it occurs in the document.
# NOTE: the resulting lists follow the vectorizer's feature order, not the
# word order of the original sentences.
doc1_expanded = [word for word, count in zip(features, doc1_vector) for _ in range(count)]
doc2_expanded = [word for word, count in zip(features, doc2_vector) for _ in range(count)]

# Edit distance between the two expanded term lists.
distance = edit_distance(doc1_expanded, doc2_expanded)

print("Edit distance between the first two documents:", distance)

Edit distance between the first two documents: 1


## 2. Summary statistics

Let's determine the word count in each document, which is more complex than it might initially appear.

First, it's unclear which parts of the document we should count. For example, the initial lines in the original filings are not typically considered content. Additionally, the word count results can vary significantly depending on the tokenizer used.

Therefore, we plan to calculate the word count in several ways:

1. Count words in the entire filings using a whitespace tokenizer.
2. Count words in the entire filings using the TreebankWordTokenizer.
3. Count words in the Management Discussion and Analysis (MD&A) section using a whitespace tokenizer.
4. Count words in the MD&A section using the TreebankWordTokenizer.

### 2.1. Example for 10-K

In [21]:
# Read one example filing into memory as a single string.
file_path = 'Local_directory/10-X_C_1993-2000/1995/QTR1/19950103_10-K_edgar_data_44471_0000044471-95-000002.txt'
with open(file_path, 'r') as file:
    content = file.read()

In [8]:
# Check it out
# print(content)

Count words in the entire filings using a whitespace tokenizer.

In [9]:
# Whitespace tokenization: add up the number of split() pieces on every line.
with open(file_path, 'r') as file:
    word_count = sum(len(line.split()) for line in file)
word_count

25617

Count words in the entire filings using the TreebankWordTokenizer.

In [10]:
tokenizer = TreebankWordTokenizer()

# Read the whole filing first; tokenizing does not need the file handle open.
with open(file_path, 'r') as file:
    content = file.read()

# Penn Treebank tokenization of the full text.
words = tokenizer.tokenize(content)
treebank_word_count = len(words)
treebank_word_count

30044

Count words in the Management Discussion and Analysis (MD&A) section using a whitespace tokenizer.

In [11]:
# Item extractors from the local scrape_yu_z module (one per filing type).
from scrape_yu_z import get_itemized_10k, get_itemized_10q 
# Extract the items of the example filing; the following cells read its 'mda' entry.
temp = get_itemized_10k(file_path)

In [12]:
# Number of whitespace-separated tokens in the extracted 'mda' entry.
len(temp['mda'].split())

52

Count words in the MD&A section using the TreebankWordTokenizer.

In [13]:
# Number of tokens in the extracted 'mda' entry according to the Treebank tokenizer.
len(tokenizer.tokenize(temp['mda']))

58

### 2.2. Example for 10-Q

In [81]:
# Switch the example to a second filing and read it into memory as a single string.
file_path = '/Local_directory/10-X_C_1993-2000/1995/QTR1/19950103_10-Q-A_edgar_data_356226_0000356226-95-000001.txt'
with open(file_path, 'r') as file:
    content = file.read()

In [41]:
#print(content)

In [42]:
# Whitespace tokenization of the current example file, summed line by line.
with open(file_path, 'r') as file:
    word_count = sum(len(line.split()) for line in file)
word_count

413

In [43]:
tokenizer = TreebankWordTokenizer()

# Load the full text, then tokenize it once the file has been closed.
with open(file_path, 'r') as file:
    content = file.read()

# Penn Treebank tokenization of the full text.
words = tokenizer.tokenize(content)
treebank_word_count = len(words)
treebank_word_count

568

In [48]:
# Run the 10-Q extractor on the current example file and show the raw result.
temp = get_itemized_10q(file_path)
print(temp)

{'mda': None}


We need to take this case into account: when no MD&A section can be extracted, the `mda` entry is `None` rather than a string.

Let's put these pieces together in a single function.

In [127]:
def word_count_functions(input_file):
    """Compute word counts for one filing and parse its identifiers from the file name.

    The file name is expected to follow the pattern
    ``<filing_date>_<category>_edgar_data_<CIK>_<accession number>.txt``.
    The CIK is read from the ``edgar_data_<CIK>`` field; the leading digits of
    the accession number identify whoever submitted the filing and are not
    necessarily the company's CIK.

    Relies on the notebook-level ``tokenizer`` (a Treebank tokenizer) and on
    ``get_itemized_10k`` / ``get_itemized_10q``.

    Parameters
    ----------
    input_file : str
        Path to the filing, using '/' as the path separator.

    Returns
    -------
    dict
        ``filing_date``, ``CIK`` and ``category`` (strings taken from the file
        name), ``simple_word_count`` and ``treebank_word_count`` (token counts
        for the whole file), and ``simple_mda`` and ``treebank_mda`` (token
        counts for the MD&A section). Both MD&A counts are 0 when no MD&A text
        could be extracted, so 0 does not distinguish "missing" from "empty".

    Raises
    ------
    IndexError
        If the file name lacks the expected underscore-separated fields.
    OSError
        If the file cannot be opened.
    """
    # Parse the file name first so a malformed name fails before any heavy work.
    name_parts = input_file.split('/')[-1].split('_')
    filing_date = name_parts[0]
    category = name_parts[1]
    cik = name_parts[-2]

    with open(input_file, 'r') as file:
        content = file.read()
    simple_word_count = len(content.split())
    treebank_word_count = len(tokenizer.tokenize(content))

    # Choose the extractor from the form type in the file name, not from the
    # whole path: a directory name containing '10-K' must not affect routing.
    extract_items = get_itemized_10k if '10-K' in category else get_itemized_10q
    try:
        mda_content = extract_items(input_file)['mda']
    except Exception:
        # Best effort: a filing whose items cannot be parsed is treated as
        # having no MD&A. KeyboardInterrupt still propagates.
        mda_content = None

    if isinstance(mda_content, str):
        simple_mda = len(mda_content.split())
        treebank_mda = len(tokenizer.tokenize(mda_content))
    else:
        # 0 is the sentinel for "no MD&A text available".
        simple_mda = 0
        treebank_mda = 0

    return {'filing_date': filing_date,
            'CIK': cik,
            'category': category,
            'simple_word_count': simple_word_count,
            'treebank_word_count': treebank_word_count,
            'simple_mda': simple_mda,
            'treebank_mda': treebank_mda}

In [87]:
# Run the combined function on the current example file and show the result as a one-row table.
pd.DataFrame([word_count_functions(file_path)])

Unnamed: 0,filing_date,CIK,category,simple_word_count,treebank_word_count,simple_mda,treebank_mda
0,19950103,356226,10-Q-A,413,568,0,0


## 3. 10-X from 1995 to 2000

In [90]:
import os

# Root folder of the filings; the batch loop appends '<year>/<quarter>' to it.
folder_path = '/Local_directory/10-X_C_1993-2000/'
# Year and quarter sub-folder names to iterate over.
years = ['1995','1996','1997','1998','1999','2000']
quarters = ['QTR1','QTR2','QTR3','QTR4']

#files = [f for f in os.listdir(folder_path+'QTR1') if os.path.isfile(os.path.join(folder_path+'QTR1', f))]
#print("Files in '", folder_path+'QTR1', "':")
#k_files = [item for item in files if 'K' in item] 
#q_files = [item for item in files if 'Q' in item]
#print(k_files[:2])

In [None]:
# Write one gzip-compressed CSV of word counts per filing year.
output_dir = 'Local_directory/julex_data/'
# Create the output folder up front so a missing directory cannot fail the
# run after a whole year of filings has been processed.
os.makedirs(output_dir, exist_ok=True)

for year in years:
    # Collect plain dict records and build the frame once per year; growing a
    # DataFrame with pd.concat inside the loop re-copies every row each time.
    records = []
    for quarter in quarters:
        # Paths are joined with '/' because word_count_functions splits on '/'.
        quarter_dir = folder_path + year + '/' + quarter
        # Sorted so the row order is reproducible; os.listdir order is arbitrary.
        for filing in sorted(os.listdir(quarter_dir)):
            filing_path = quarter_dir + '/' + filing
            if not os.path.isfile(filing_path):
                continue  # skip sub-directories and other non-file entries
            records.append(word_count_functions(filing_path))
        print(year, quarter)
    df = pd.DataFrame(records)
    df.to_csv(output_dir + year + '.csv.gz',
              compression='gzip')