# A3
Ziqi Zhang

## Read the training and testing files, then clean their body text

In [33]:
import pandas as pd
# Load the labelled training filings; later cells read its "body" and "label" columns.
df_train = pd.read_csv("10k_filings_train.csv")

In [34]:
import nltk
from nltk.corpus import stopwords
# Make sure the NLTK stopword corpus is available locally before reading it.
nltk.download('stopwords')
# Keep the English stopwords in a set for fast membership tests during cleaning.
stopwords_set = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tonyz\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [35]:
import re
from bs4 import BeautifulSoup
# A token is a lowercase letter followed by at least one more word character.
WORD_RE = re.compile(r"\b[a-z]\w+\b")


def clean10k_text(text):
  """Strip markup from a filing body and return its lowercase content words.

  Args:
    text: Raw filing body, which may still contain HTML. Non-string values
      (for example the NaN pandas yields for an empty CSV cell) are treated
      as an empty document.

  Returns:
    A single space-joined string of the tokens that match WORD_RE and are
    not in stopwords_set.
  """
  if not isinstance(text, str):
    return ""
  # Name the parser explicitly so the extracted text does not depend on which
  # optional parsers happen to be installed, and bs4 does not warn about guessing.
  soup = BeautifulSoup(text, "html.parser")
  # Join text nodes with a space so words from adjacent tags do not fuse together.
  plain_text = soup.get_text(" ").lower()
  tokens = WORD_RE.findall(plain_text)
  content_words = [w for w in tokens if w not in stopwords_set]
  return " ".join(content_words)


df_train["cleaned_body"] = df_train["body"].map(clean10k_text)

In [119]:
# Embedding: load the pretrained "glove-wiki-gigaword-300" word vectors through gensim's downloader.
import gensim.downloader
glove_vectors = gensim.downloader.load("glove-wiki-gigaword-300")

In [120]:
import numpy as np
def average_embeddings(document_text):
  """Average the GloVe vectors of a document's in-vocabulary words.

  Args:
    document_text: Whitespace-separated tokens of one document.

  Returns:
    An array with a single row: the element-wise mean of the vectors of all
    tokens found in glove_vectors, or a (1, 300) row of zeros when no token
    is found.
  """
  known_vectors = [
    glove_vectors[token]
    for token in document_text.split()
    if token in glove_vectors
  ]
  if not known_vectors:
    return np.zeros((1, 300))
  stacked = np.array(known_vectors)
  return stacked.mean(axis=0).reshape(1, -1)

def _embed_corpus(documents):
  """Stack one averaged GloVe row per document into a 2-D matrix.

  Args:
    documents: Iterable of cleaned document strings (must not be empty).

  Returns:
    An (n_documents, dim) array whose rows follow the order of `documents`.
  """
  # np.vstack keeps the matrix 2-D even for a single document, whereas
  # np.array(rows).squeeze() would collapse that case to a 1-D vector.
  return np.vstack([average_embeddings(doc) for doc in documents])


glove_doc_embeddings = _embed_corpus(df_train["cleaned_body"])

# Apply the same cleaning and embedding steps to the unlabelled test filings.
test_df = pd.read_csv("10k_filings_test_wo_labels.csv")
test_df["cleaned_body"] = test_df["body"].map(clean10k_text)
test_doc_embeddings = _embed_corpus(test_df["cleaned_body"])

## Naive Bayes

In [109]:
# TfidfVectorizer is what the pipeline uses, so it must be imported here;
# without it the cell raises NameError on a fresh kernel.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

# TF-IDF features -> dense array -> Gaussian Naive Bayes.
# The text is already tokenised and space-joined, so str.split is the tokenizer;
# the 'toarray' step densifies the sparse TF-IDF matrix before it reaches GaussianNB.
pipe = Pipeline([
  ('tfid', TfidfVectorizer(tokenizer=str.split, min_df=5)),
  ('toarray', FunctionTransformer(lambda tfid: tfid.toarray())),
  ('gnb', GaussianNB()),
])

In [121]:
from sklearn.model_selection import cross_validate

# Features are the cleaned filing text; targets come from the label column.
X = df_train["cleaned_body"]
y = df_train.label

# Fit on the full training set and run a prediction pass over it (result unused).
pipe.fit(X, y)
pipe.predict(X)

# Cross-validate the TF-IDF + Gaussian NB pipeline with micro-averaged F1.
cv_results = cross_validate(
  pipe,
  X=X,
  y=y,
  scoring="f1_micro",
  return_train_score=True,
)
train_avg = np.mean(cv_results['train_score'])
test_avg = np.mean(cv_results['test_score'])
print(f"Avg train score: {train_avg:.2}\tAvg test score: {test_avg:.2}")

Avg train score: 0.99	Avg test score: 0.65


In [123]:
# Second feature set: the averaged GloVe document embeddings.
X2 = glove_doc_embeddings

# This explicit fit is what the later GNB.predict(test_doc_embeddings) cell relies on.
GNB = GaussianNB()
GNB.fit(X2, y)
GNB.predict(X2)

# Cross-validate Gaussian NB on the embeddings with micro-averaged F1.
cv_results = cross_validate(
  GNB,
  X=X2,
  y=y,
  scoring="f1_micro",
  return_train_score=True,
)
train_avg = np.mean(cv_results['train_score'])
test_avg = np.mean(cv_results['test_score'])
print(f"Avg train score: {train_avg:.2}\tAvg test score: {test_avg:.2}")

Avg train score: 0.67	Avg test score: 0.65


In [124]:
# Predict labels for the unlabelled test filings with the fitted Gaussian NB model.
# The test frame is named test_df (the frame test_doc_embeddings was built from,
# row by row), so predictions line up with its "id" column.
# assign() builds the output frame without mutating test_df.
df_out = test_df[["id"]].assign(prediction_labels=GNB.predict(test_doc_embeddings))
df_out.to_csv('naive_bayes_pred_zzhan178.csv', index=False)
df_out

Unnamed: 0,id,prediction_labels
0,935036-2017-1,Services
1,1435049-2017-1,Manufacturing
2,1158449-2017-1,Retail Trade
3,849706-2017-1,Wholesale Trade
4,105608-2017-1,Wholesale Trade
...,...,...
756,1675634-2017-4,Services
757,1614556-2017-4,"Finance, Insurance, And Real Estate"
758,1557798-2017-4,"Finance, Insurance, And Real Estate"
759,1564709-2017-4,Retail Trade


## Logistic Regression

In [127]:
# TfidfVectorizer is imported here so this cell does not depend on a name
# that no earlier cell brings into scope.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# TF-IDF features -> dense array -> logistic regression (default settings).
pipe2 = Pipeline([
  ('tfid', TfidfVectorizer(tokenizer=str.split, min_df=5)),
  ('toarray', FunctionTransformer(lambda tfid: tfid.toarray())),
  ('clf', LogisticRegression()),
])

# Fit on the full training set and run a prediction pass over it (result unused).
pipe2.fit(X, y).predict(X)

# Cross-validate the pipeline with micro-averaged F1.
cv_results = cross_validate(pipe2, X=X, y=y, scoring="f1_micro", return_train_score=True)
print(f"Avg train score: {np.average(cv_results['train_score']):.2}\tAvg test score: {np.average(cv_results['test_score']):.2}")

Avg train score: 0.87	Avg test score: 0.8


In [129]:
# Multinomial logistic regression on the averaged GloVe document embeddings (X2).
# NOTE(review): newer scikit-learn releases may deprecate `multi_class` — confirm
# against the installed version.
LR = LogisticRegression(
  random_state=0,
  max_iter=10000,
  multi_class="multinomial",
)

# This explicit fit is what the later LR.predict(test_doc_embeddings) cell relies on.
LR.fit(X2, y)
LR.predict(X2)

# Cross-validate the classifier with micro-averaged F1.
cv_results = cross_validate(
  LR,
  X=X2,
  y=y,
  scoring="f1_micro",
  return_train_score=True,
)
train_avg = np.mean(cv_results['train_score'])
test_avg = np.mean(cv_results['test_score'])
print(f"Avg train score: {train_avg:.2}\tAvg test score: {test_avg:.2}")

Avg train score: 0.81	Avg test score: 0.79


In [133]:
# Predict labels for the unlabelled test filings with the fitted logistic regression.
# The test frame is named test_df (the frame test_doc_embeddings was built from,
# row by row), so predictions line up with its "id" column.
# assign() builds the output frame without mutating test_df.
df_out = test_df[["id"]].assign(prediction_labels=LR.predict(test_doc_embeddings))
df_out.to_csv('logistic_pred_zzhan178.csv', index=False)
df_out

Unnamed: 0,id,prediction_labels
0,935036-2017-1,Services
1,1435049-2017-1,Manufacturing
2,1158449-2017-1,Retail Trade
3,849706-2017-1,Manufacturing
4,105608-2017-1,Manufacturing
...,...,...
756,1675634-2017-4,Services
757,1614556-2017-4,Services
758,1557798-2017-4,Services
759,1564709-2017-4,Manufacturing
