# Assignment 1

## Data Preprocessing

### Imports

In [None]:
import os
import requests
import zipfile
from tqdm import tqdm
import time
import datetime
from IPython.display import display

from typing import List, Dict, Tuple

import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_score, \
                            f1_score, recall_score, ConfusionMatrixDisplay

import nltk
from nltk.corpus import stopwords
import gensim
import gensim.downloader as gloader
assert int(gensim.__version__.split('.')[0]) >= 4, "Install gensim 4.x.x or above (pip install -U gensim)"

nltk.download('stopwords')
%matplotlib inline

# Shared hyperparameters. The names suggest an L2 regularisation strength, a
# dropout rate and a mini-batch size; presumably they are consumed by the
# models defined later in the notebook — TODO confirm against their usage.
L2_REG = 0.0001
DROPOUT_RATE = 0.2
BATCH_SIZE = 128

### Downloading the Dataset

In [5]:
# ROOT_PATH is the parent of the current working directory, so the dataset is
# stored in a `data` directory that sits next to (not inside) the cwd.
ROOT_PATH = os.path.dirname(os.getcwd())
DATA_PATH = os.path.join(ROOT_PATH, 'data')

def save_response_content(response, destination):
    """Write the streamed body of ``response`` to the file ``destination``.

    The body is pulled through ``response.iter_content`` in blocks of
    32768 bytes and written in binary mode; any existing file at
    ``destination`` is overwritten.
    """
    chunk_size = 32768

    with open(destination, "wb") as out_file:
        # Empty chunks (keep-alive) carry no payload and are dropped.
        out_file.writelines(
            piece for piece in response.iter_content(chunk_size) if piece
        )

def download_data(data_path):
    """Download and extract the dependency treebank archive into ``data_path``.

    The archive is fetched only when ``dependency_treebank.zip`` is not
    already present in ``data_path``; it is extracted right after a fresh
    download.

    Args:
        data_path: Directory in which the zip file is stored and extracted.
            It is created if it does not exist.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    final_path = os.path.join(data_path, 'dependency_treebank.zip')
    url = "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/dependency_treebank.zip"

    os.makedirs(data_path, exist_ok=True)

    if not os.path.exists(final_path):
        print("Downloading dataset...")
        # Download to a temporary name first: an interrupted or failed
        # transfer must not leave a file at `final_path`, otherwise the
        # existence check above would skip the download on the next run.
        partial_path = final_path + '.part'
        with requests.Session() as current_session:
            # The body is streamed, so it has to be consumed while the
            # session (and its connection) is still open.
            with current_session.get(url, stream=True) as response:
                # Fail loudly instead of saving an HTTP error page as a zip.
                response.raise_for_status()
                save_response_content(response, partial_path)
        os.replace(partial_path, final_path)
        print("Download completed!")

        print("Extracting dataset...")
        with zipfile.ZipFile(final_path) as loaded_zip:
            loaded_zip.extractall(data_path)
        print("Extraction completed!")

# Fetch and unpack the archive into DATA_PATH; nothing is done when the zip
# file is already there.
download_data(DATA_PATH)

# From here on DATA_PATH points at the `dependency_treebank` sub-directory of
# the download directory, which is where the `.dp` files are read from.
DATA_PATH = os.path.join(DATA_PATH, 'dependency_treebank')

The dataset is composed of several files whose naming convention is `wsj_{num}.dp`. We explore the content of the first file:

In [9]:
with open(os.path.join(DATA_PATH, 'wsj_0001.dp'), 'r') as f:
    # Walk the file handle line by line, dropping trailing whitespace
    # (including the newline) from each line.
    lines = [raw_line.rstrip() for raw_line in f]

print(lines)

['Pierre\tNNP\t2', 'Vinken\tNNP\t8', ',\t,\t2', '61\tCD\t5', 'years\tNNS\t6', 'old\tJJ\t2', ',\t,\t2', 'will\tMD\t0', 'join\tVB\t8', 'the\tDT\t11', 'board\tNN\t9', 'as\tIN\t9', 'a\tDT\t15', 'nonexecutive\tJJ\t15', 'director\tNN\t12', 'Nov.\tNNP\t9', '29\tCD\t16', '.\t.\t8', '', 'Mr.\tNNP\t2', 'Vinken\tNNP\t3', 'is\tVBZ\t0', 'chairman\tNN\t3', 'of\tIN\t4', 'Elsevier\tNNP\t7', 'N.V.\tNNP\t12', ',\t,\t12', 'the\tDT\t12', 'Dutch\tNNP\t12', 'publishing\tVBG\t12', 'group\tNN\t5', '.\t.\t3']


The content of each file is a sequence of lines, each containing a word, a tag and a number separated by tab characters (`\t`); an empty line separates consecutive sentences. We can define a function that, given a file path, processes its content and returns a Pandas `DataFrame` for it. We ignore the third element.

In [15]:
def process_file(filepath: str) -> pd.DataFrame:
    """Load one treebank file into a ``DataFrame`` with ``word`` and ``tag`` columns.

    The files are not technically CSV, but they are tab-separated text, so
    ``pd.read_csv`` can parse them. The third field of every line is read
    and then discarded; blank lines are skipped by ``read_csv``.

    Args:
        filepath: Path of the file to read.

    Returns:
        A ``DataFrame`` with one row per non-blank line and the string
        columns ``word`` and ``tag``.
    """
    # Local import: only needed for the quoting constant below.
    import csv

    with open(filepath, 'r') as f:
        df = pd.read_csv(
            f,
            sep='\t',
            names=['word', 'tag', 'drop'],
            # Tokens are text: keep e.g. "61" a string in every file.
            dtype=str,
            # Without this, tokens such as "null", "NA" or "None" would be
            # silently replaced by NaN.
            na_filter=False,
            # A token that is (or starts with) a double quote must be read
            # literally, not as the opening of a quoted field.
            quoting=csv.QUOTE_NONE,
        )
    return df.drop(columns=['drop'])

# Example: parse the first file of the treebank. Being the last expression of
# the cell, the resulting DataFrame is displayed as the cell output.
process_file(os.path.join(DATA_PATH, 'wsj_0001.dp'))

Unnamed: 0,word,tag
0,Pierre,NNP
1,Vinken,NNP
2,",",","
3,61,CD
4,years,NNS
5,old,JJ
6,",",","
7,will,MD
8,join,VB
9,the,DT


## Embeddings

## Baseline model

## Experiments

## Evaluation

## Error Analysis