In [1]:
import numpy as np
from pathlib import Path

# Text classification
In this notebook we will explore a basic text classification pipeline.

## Subjectivity Dataset
The subjectivity dataset has 5000 subjective and 5000 objective processed sentences. To get the data:

In [2]:
def unpack_dataset():
    ! wget http://www.cs.cornell.edu/people/pabo/movie-review-data/rotten_imdb.tar.gz
    ! mkdir data
    ! tar -xvf rotten_imdb.tar.gz -C data

In [3]:
#unpack_dataset()

In [4]:
!ls data

plot.tok.gt9.5000   quote.tok.gt9.5000  subjdata.README.1.0


In [5]:
! head -2 data/plot.tok.gt9.5000

the movie begins in the past where a young boy named sam attempts to save celebi from a hunter . 
emerging from the human psyche and showing characteristics of abstract expressionism , minimalism and russian constructivism , graffiti removal has secured its place in the history of modern art while being created by artists who are unconscious of their artistic achievements . 


In [6]:
# Root folder of the extracted dataset; list its contents as a sanity check.
PATH = Path("data")
list(PATH.iterdir())

[PosixPath('data/plot.tok.gt9.5000'),
 PosixPath('data/subjdata.README.1.0'),
 PosixPath('data/quote.tok.gt9.5000')]

### Tokenization 
Tokenization is the task of chopping up text into pieces, called tokens. spaCy is an open-source software library for advanced Natural Language Processing. Here we will use it for tokenization. To install spaCy:

pip install -U pip setuptools wheel <br>
pip install -U spacy <br>
python -m spacy download en_core_web_sm

In [7]:
def read_file(path):
    """Return the lines of the text file at `path` as a list of strings.

    The file is decoded as ISO-8859-1 and each line keeps its trailing
    newline character.
    """
    with open(path, encoding="ISO-8859-1") as handle:
        return list(handle)

In [8]:
obj_lines = read_file(PATH/"plot.tok.gt9.5000")
obj_lines[0]

'the movie begins in the past where a young boy named sam attempts to save celebi from a hunter . \n'

In [9]:
# simple tokenization by splitting on spaces
np.array(obj_lines[0].strip().lower().split(" "))

array(['the', 'movie', 'begins', 'in', 'the', 'past', 'where', 'a',
       'young', 'boy', 'named', 'sam', 'attempts', 'to', 'save', 'celebi',
       'from', 'a', 'hunter', '.'], dtype='<U8')

In [10]:
import spacy

In [11]:
# Load the small English spaCy pipeline (the model installed with
# `python -m spacy download en_core_web_sm`).
nlp = spacy.load('en_core_web_sm')

In [12]:
# Here is an example of what you can do with spaCy:
# print each token together with its part-of-speech tag.
line = nlp(obj_lines[0])
for token in line:
    print(token.text, token.pos_)

the DET
movie NOUN
begins VERB
in ADP
the DET
past NOUN
where ADV
a DET
young ADJ
boy NOUN
named VERB
sam PROPN
attempts NOUN
to PART
save VERB
celebi NOUN
from ADP
a DET
hunter NOUN
. PUNCT

 SPACE


In [13]:
np.array([token for token in line])

array([the, movie, begins, in, the, past, where, a, young, boy, named,
       sam, attempts, to, save, celebi, from, a, hunter, ., 
], dtype=object)

In [14]:
def tokenize_sentence(s):
    """Tokenize one sentence with the module-level spaCy pipeline `nlp`.

    Leading and trailing whitespace (including the newline at the end of
    each dataset line) is stripped before parsing. Returns a NumPy array
    holding the resulting token objects.
    """
    doc = nlp(str(s.strip()))
    tokens = list(doc)
    return np.array(tokens)

In [15]:
tokenize_sentence(obj_lines[0])

array([the, movie, begins, in, the, past, where, a, young, boy, named,
       sam, attempts, to, save, celebi, from, a, hunter, .], dtype=object)

### Split dataset in train and validation

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
# quote.tok.gt9.5000 is loaded as the subjective sentences,
# plot.tok.gt9.5000 as the objective ones.
sub_content = read_file(PATH/"quote.tok.gt9.5000")
obj_content = read_file(PATH/"plot.tok.gt9.5000")

In [18]:
# Label convention: 0 = subjective sentence, 1 = objective sentence.
sub_y = np.zeros(len(sub_content))
obj_y = np.ones(len(obj_content))
# Stack both classes (subjective first) so that X and y stay aligned.
X = np.concatenate([sub_content, obj_content])
y = np.concatenate([sub_y, obj_y])

In [19]:
# Hold out 20% of the sentences for validation; the fixed random_state keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
X_train[:3], y_train[:3]

(array(['will god let her fall or give her a new path ? \n',
        "the director's twitchy sketchbook style and adroit perspective shifts grow wearisome amid leaden pacing and indifferent craftsmanship ( most notably wretched sound design ) . \n",
        "welles groupie/scholar peter bogdanovich took a long time to do it , but he's finally provided his own broadside at publishing giant william randolph hearst . \n"],
       dtype='<U693'),
 array([1., 0., 0.]))

### Tokenize all

In [21]:
# Tokenize every training sentence.
X_train_t = [tokenize_sentence(sentence) for sentence in X_train]

In [22]:
# Tokenize every validation sentence.
X_val_t = [tokenize_sentence(sentence) for sentence in X_val]

In [23]:
X_train_t[0]

array([will, god, let, her, fall, or, give, her, a, new, path, ?],
      dtype=object)

## word embeddings from spaCy

In [24]:
nlp("am").vector

array([-0.5799826 , -1.009465  ,  0.62448525, -0.3746994 ,  0.04847589,
        0.23544501,  0.16930383, -0.5892657 , -0.4099527 , -0.0397221 ,
        0.09905334, -0.17823452,  1.8112568 , -0.41191506, -0.1911665 ,
        0.26377833, -0.20858057,  0.24163836, -0.8667144 , -0.40949103,
        0.70684826,  0.40622658,  0.00363696,  0.16584054, -0.22824581,
        0.9838457 ,  0.71979254, -0.33249682, -0.73977196,  0.1915259 ,
       -0.9682615 , -0.09848729,  0.06264922, -0.24534428, -0.30293864,
       -0.5626753 ,  1.1686754 , -0.0945009 , -0.46837738, -0.47827306,
        0.02991231,  0.34009436,  0.88246363,  0.27727336, -0.21391977,
       -0.5353824 ,  0.5017903 , -0.09786895, -1.1179507 ,  0.5685893 ,
        0.50877166,  0.2935202 , -0.12042533,  0.46025705,  0.3654778 ,
        0.15245938,  0.48111284,  0.0421572 ,  1.1205167 , -0.87217855,
        0.46665782, -0.60350204,  1.783681  ,  0.27612534,  0.00711006,
        0.55750287, -0.8106657 , -1.1621315 , -0.46883765,  0.23

### Sentence encoding
Each sentence is going to be represented with the average of its word embeddings.

In [25]:
X_train_t[0]

array([will, god, let, her, fall, or, give, her, a, new, path, ?],
      dtype=object)

In [26]:
# One row per token of the first training sentence, one column per embedding dimension.
x = np.array([token.vector for token in X_train_t[0]])
x.shape

(12, 96)

In [27]:
x.mean(axis=0).shape

(96,)

In [28]:
def sentence_encoding(s):
    """Encode a tokenized sentence as the mean of its token vectors.

    `s` is an iterable of objects exposing a `.vector` attribute; the
    vectors are stacked one per row and averaged over the token axis.
    """
    token_vectors = [tok.vector for tok in s]
    return np.array(token_vectors).mean(axis=0)

In [29]:
# Averaged-embedding encoding of every training sentence.
x_train = np.array([sentence_encoding(tokens) for tokens in X_train_t])

In [30]:
# Averaged-embedding encoding of every validation sentence.
x_val = np.array([sentence_encoding(tokens) for tokens in X_val_t])

In [31]:
x_train.shape, x_val.shape 

((8000, 96), (2000, 96))

### Logistic regression 

In [32]:
from sklearn.linear_model import LogisticRegression

In [33]:
# Baseline: logistic regression on the averaged-embedding sentence vectors.
clf = LogisticRegression(random_state=0, C=1).fit(x_train, y_train)

In [34]:
clf.score(x_val, y_val)

0.79

## Encoding V2: tfidf
In the tfidf formula each word in a document is weighted by 
$$tf_{i,j} \cdot \log \frac{N}{df_i}$$

$tf_{i,j}$ is the number of occurrences of word $i$ in doc $j$ <br>
$df_i$ is the number of documents containing $i$ <br>
$N$ is the number of documents <br>

We will encode a sentence using its word embeddings weighted by tfidf.

In [37]:
set(X_train_t[0])

{will, god, let, her, fall, or, give, her, a, new, path, ?}

In [52]:
def counts_per_doc(docs):
    """Return the document frequency of every word.

    `docs` is a sequence of tokenized documents. Each token is converted
    with `str()`, and a word is counted at most once per document, so the
    result maps word -> number of documents containing it.
    """
    doc_freq = {}
    for doc in docs:
        # A set collapses repeated words within the same document.
        for word in {str(tok) for tok in doc}:
            if word in doc_freq:
                doc_freq[word] += 1
            else:
                doc_freq[word] = 1
    return doc_freq

In [53]:
# N: number of training sentences, i.e. the documents of the tf-idf formula.
N = len(X_train_t)
N

8000

In [56]:
# Document frequency of every word, computed on the training sentences only.
counts = counts_per_doc(X_train_t)

In [57]:
def compute_tfidf(counts, N):
    """Build the per-word weight log(N / document frequency).

    `counts` maps each word to the number of documents containing it and
    `N` is the total number of documents. The returned dict also holds an
    "UNK" entry equal to log(N), used as the weight of words that are
    missing from `counts`.
    """
    weights = {word: np.log(N / doc_freq) for word, doc_freq in counts.items()}
    # "UNK" goes first so a real "UNK" word in `counts` overrides the fallback.
    return {"UNK": np.log(N), **weights}

In [58]:
# Per-word log(N/df) weights plus the "UNK" fallback entry.
tfidf = compute_tfidf(counts, N)

In [60]:
#tfidf

In [64]:
def sentence_encoding_v2(s, tfidf=tfidf):
    """Encode a tokenized sentence as the mean of its weighted token vectors.

    Each token vector is multiplied by the weight stored in `tfidf` under
    the token's string form; tokens missing from `tfidf` use the "UNK"
    weight. The default `tfidf` is the module-level dict as it exists when
    this function is defined.
    """
    weighted = []
    for tok in s:
        weight = tfidf.get(str(tok), tfidf["UNK"])
        weighted.append(tok.vector * weight)
    return np.array(weighted).mean(axis=0)

In [62]:
# Document frequency of each token in the first training sentence.
for w in X_train_t[0]:
    print(w, counts[str(w)])

will 286
god 34
let 22
her 664
fall 29
or 305
give 46
her 664
a 4311
new 303
path 23
? 145


In [66]:
# Re-encode both splits with the weighted embeddings (replaces the earlier x_train / x_val).
x_train = np.array([sentence_encoding_v2(tokens) for tokens in X_train_t])
x_val = np.array([sentence_encoding_v2(tokens) for tokens in X_val_t])

In [71]:
# Refit on the weighted encodings; this clf replaces the averaged-embedding baseline.
clf = LogisticRegression(random_state=0, C=1).fit(x_train, y_train)

In [72]:
clf.score(x_val, y_val)

0.784

## References
https://www.cs.cornell.edu/home/llee/papers/sentiment.home.html

# Lab
Here are some ideas on how to modify this pipeline:
1. Make a model that just uses the part-of-speech tags. Do you get any signal? Then add this information to the current model.
2. Make a version of the first model without stopwords and punctuations.
3. Keep just top words. 
4. Use unique words per sentence.
5. Use just adjectives and adverbs.
6. Apply any of these techniques to this other dataset (https://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz)

In [101]:
from spacy.lang.en.stop_words import STOP_WORDS

In [103]:
#STOP_WORDS

In [86]:
import pandas as pd
words = pd.DataFrame({"x":["a", "b", "c"]})
words2 = pd.DataFrame({"x":["a", "b", "d"]})

In [106]:
def df_string_to_cat(df, colname):
    """Convert `df[colname]` in place to an ordered categorical column.

    Returns the resulting categories so the same encoding can be applied
    to another DataFrame later.
    """
    ordered_col = df[colname].astype('category').cat.as_ordered()
    df[colname] = ordered_col
    return ordered_col.cat.categories

def df_cat_to_catcode(df, col):
    """Replace the categorical column `df[col]` in place with its integer codes plus one.

    pandas encodes a missing category as -1, so after the shift missing
    values become 0 and real categories start at 1.
    """
    codes = df[col].cat.codes
    df[col] = codes + 1
            
def df_apply_cats(df_test, colname, catencoders):
    """Re-encode `df_test[colname]` in place with a previously learned set of categories.

    Parameters
    ----------
    df_test : DataFrame whose column is converted.
    colname : name of the column to convert.
    catencoders : the categories to apply, e.g. the value returned by
        `df_string_to_cat` on the training frame.

    Values of the column that are not among `catencoders` become missing (NaN).
    """
    # Use the categories passed in rather than a notebook-level global.
    df_test[colname] = pd.Categorical(df_test[colname], categories=catencoders, ordered=True)

In [107]:
import pandas as pd
words = pd.DataFrame({"x":["a", "b", "c"]})
words2 = pd.DataFrame({"x":["a", "b", "d"]})

In [108]:
encoder = df_string_to_cat(words, 'x')

In [109]:
encoder

Index(['a', 'b', 'c'], dtype='object')

In [110]:
df_cat_to_catcode(words, 'x')

In [111]:
words

Unnamed: 0,x
0,1
1,2
2,3


In [112]:
df_apply_cats(words2, 'x', encoder)

In [113]:
words2

Unnamed: 0,x
0,a
1,b
2,
