<a href="https://colab.research.google.com/github/vikramkrishnan9885/MyColab/blob/master/NLPSeqProcessingPytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

| Algorithm | Description | Applications |
|-----|----|----|
| One-to-one RNNs | These take a single input and give a single output. Current input depends on the previously observed input(s). | Stock market prediction, scene classification, and text generation |
| One-to-many RNNs | These take a single input and give an output consisting of an arbitrary number of elements. | Image captioning |
| Many-to-one RNNs | These take a sequence of inputs and give a single output. | Sentence classification (considering a single word as a single input) |
| Many-to-many RNNs | These take a sequence of arbitrary length as input and output a sequence of arbitrary length. | Machine translation, chatbots |

# Helper functions

In [11]:
import requests

def progress_bar(some_iter):
    """Wrap an iterable in a tqdm progress bar when tqdm is available.

    Args:
        some_iter: The iterable to wrap.

    Returns:
        ``tqdm(some_iter)`` if tqdm can be imported; otherwise ``some_iter``
        itself, unchanged.
    """
    try:
        from tqdm import tqdm
    except ModuleNotFoundError:
        # tqdm is optional: fall back to the bare iterable.
        wrapped = some_iter
    else:
        wrapped = tqdm(some_iter)
    return wrapped

def download_file_from_google_drive(id, destination):
    """Download a file from the Google Docs/Drive download endpoint to disk.

    A first request is sent with the file id. If the response carries a cookie
    whose name starts with ``download_warning``, the request is repeated with
    that cookie's value passed as the ``confirm`` parameter (presumably the
    confirmation step Drive asks for on some files -- verify against current
    Drive behaviour). The body of the final response is streamed to
    ``destination``.

    Args:
        id: File id sent as the ``id`` query parameter.
        destination: Local path the response body is written to (opened in
            binary write mode, so an existing file is replaced).

    Raises:
        requests.HTTPError: If either request returns a 4xx/5xx status. In
            that case nothing is written to ``destination``.
    """
    print("Trying to fetch {}".format(destination))

    def get_confirm_token(response):
        """Return the value of the first ``download_warning*`` cookie, or None."""
        for key, value in response.cookies.items():
            if key.startswith('download_warning'):
                return value

        return None

    def save_response_content(response, destination):
        """Stream the response body to ``destination`` in fixed-size chunks."""
        CHUNK_SIZE = 32768

        with open(destination, "wb") as f:
            for chunk in progress_bar(response.iter_content(CHUNK_SIZE)):
                if chunk: # filter out keep-alive new chunks
                    f.write(chunk)

    URL = "https://docs.google.com/uc?export=download"

    # The context manager closes the session (and its pooled connections,
    # including those held by streamed responses) even if a request or the
    # file write fails.
    with requests.Session() as session:
        response = session.get(URL, params = { 'id' : id }, stream = True)
        # Fail loudly instead of saving an HTTP error page as the dataset.
        response.raise_for_status()
        token = get_confirm_token(response)

        if token:
            # Release the first (unread) streamed response before retrying.
            response.close()
            params = { 'id' : id, 'confirm' : token }
            response = session.get(URL, params = params, stream = True)
            response.raise_for_status()

        save_response_content(response, destination)

# Surnames classification problem

## Data download

In [2]:
# Drive file id of the raw surnames data and the local filename to save it under
file_id_0, destination_0 = '1MBiOU5UCaGpJw2keXAqOLL8PCJg_uZaU', 'surnames.csv'
download_file_from_google_drive(id=file_id_0, destination=destination_0)

Trying to fetch surnames.csv


6it [00:00, 1578.98it/s]


In [3]:
# Drive file id of the second file and the local filename to save it under
file_id_1, destination_1 = '1T1la2tYO1O7XkMRawG8VcFcvtjbxDqU-', 'surnames_with_splits.csv'
download_file_from_google_drive(id=file_id_1, destination=destination_1)

Trying to fetch surnames_with_splits.csv


8it [00:00, 1560.89it/s]


## Data munging

In [4]:
import collections
import numpy as np
import pandas as pd
import re

from argparse import Namespace

In [5]:
# Settings for the data-munging step: input and output CSV paths, the split
# proportions, and the seed used for shuffling.
args = Namespace(**{
    "raw_dataset_csv": "surnames.csv",
    "train_proportion": 0.7,
    "val_proportion": 0.15,
    "test_proportion": 0.15,
    "output_munged_csv": "surnames_with_splits.csv",
    "seed": 1337,
})

In [6]:
# Load the raw CSV; the first line of the file supplies the column names
surnames = pd.read_csv(filepath_or_buffer=args.raw_dataset_csv, header=0)
surnames.head()

Unnamed: 0,surname,nationality
0,Woodford,English
1,Coté,French
2,Kore,English
3,Koury,Arabic
4,Lebzak,Russian


In [7]:
# Distinct values of the nationality column (the class labels)
set(surnames["nationality"])

{'Arabic',
 'Chinese',
 'Czech',
 'Dutch',
 'English',
 'French',
 'German',
 'Greek',
 'Irish',
 'Italian',
 'Japanese',
 'Korean',
 'Polish',
 'Portuguese',
 'Russian',
 'Scottish',
 'Spanish',
 'Vietnamese'}

In [8]:
# Stratified train/val/test split: rows are grouped by nationality and each
# group is shuffled and split on its own.

# Group the rows (as plain dicts) by nationality
by_nationality = collections.defaultdict(list)
for record in surnames.to_dict(orient="records"):
    by_nationality[record["nationality"]].append(record)


# Create split data
final_list = []
np.random.seed(args.seed)
# Iterate in sorted nationality order so the shuffles are reproducible for a given seed
for _, item_list in sorted(by_nationality.items()):
    np.random.shuffle(item_list)
    n = len(item_list)
    n_train = int(args.train_proportion*n)
    n_val = int(args.val_proportion*n)

    # Give each data point a split attribute. The test split is whatever is
    # left after train and val, so rows lost to int() truncation end up in test.
    for item in item_list[:n_train]:
        item['split'] = 'train'
    for item in item_list[n_train:n_train+n_val]:
        item['split'] = 'val'
    for item in item_list[n_train+n_val:]:
        item['split'] = 'test'

    # Add to final list
    final_list.extend(item_list)


# Collect the labelled rows into a DataFrame (it is written to CSV in a later cell)
final_surnames = pd.DataFrame(final_list)


# Number of rows per split
final_surnames["split"].value_counts()

train    7680
test     1660
val      1640
Name: split, dtype: int64

In [9]:
# Preview the first five rows, now carrying the split column
final_surnames.head(n=5)

Unnamed: 0,surname,nationality,split
0,Totah,Arabic,train
1,Abboud,Arabic,train
2,Fakhoury,Arabic,train
3,Srour,Arabic,train
4,Sayegh,Arabic,train


In [10]:
# Save the split-labelled rows as CSV without the DataFrame index.
# NOTE: the output path is 'surnames_with_splits.csv', the same filename the
# second download cell wrote, so that downloaded copy is overwritten here.
final_surnames.to_csv(path_or_buf=args.output_munged_csv, index=False)

## Data processing using PyTorch

In [12]:
from argparse import Namespace
import os
import json

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm_notebook