In [1]:
import sys
# Put the relative 'src' directory on the module search path. The data_utils and
# dimension.lexical imports further down presumably resolve from there — verify.
# Being relative, the entry depends on the kernel's working directory.
sys.path.append('src')

***
## Module Imports

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

In [3]:
from data_utils import load_data
from dimension.lexical import *

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/victorbadenas/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


***
## Data

In [4]:
# Load the train and test splits from the relative 'data/' directory and report
# how many rows each one holds. Both names are reused by later cells.
train_data, test_data = load_data('data/')
print(f"train_data samples: {len(train_data)}, test_data samples: {len(test_data)}")

train_data samples: 2234, test_data samples: 3108


In [5]:
# Preview the first rows of the training split (sentence pair S1/S2 and the Gs column).
train_data.head()

Unnamed: 0,S1,S2,Gs
0,But other sources close to the sale said Viven...,But other sources close to the sale said Viven...,4.0
1,Micron has declared its first quarterly profit...,Micron's numbers also marked the first quarter...,3.75
2,The fines are part of failed Republican effort...,"Perry said he backs the Senate's efforts, incl...",2.8
3,"The American Anglican Council, which represent...","The American Anglican Council, which represent...",3.4
4,The tech-loaded Nasdaq composite rose 20.96 po...,The technology-laced Nasdaq Composite Index <....,2.4


In [6]:
# Preview the first rows of the test split (sentence pair S1/S2 and the Gs column).
test_data.head()

Unnamed: 0,S1,S2,Gs
0,The problem likely will mean corrective change...,He said the problem needs to be corrected befo...,4.4
1,The technology-laced Nasdaq Composite Index .I...,The broad Standard & Poor's 500 Index .SPX inc...,0.8
2,"""It's a huge black eye,"" said publisher Arthur...","""It's a huge black eye,"" Arthur Sulzberger, th...",3.6
3,SEC Chairman William Donaldson said there is a...,"""I think there's a building confidence that th...",3.4
4,Vivendi shares closed 1.9 percent at 15.80 eur...,"In New York, Vivendi shares were 1.4 percent d...",1.4


***
## Feature loading

### feature vector builder for dataframe of sentence pairs

Declares the function that iterates over the dataframe containing the sentence pairs (any other columns are ignored). The sentence columns must be named `"S1"` and `"S2"`.

Returns a numpy array of shape `(n_sentence_pairs, n_features)`

In [7]:
def get_features(df: pd.DataFrame) -> np.ndarray:
    """Build one feature vector per sentence pair in ``df``.

    Every row's ``S1``/``S2`` pair is run through the lexical helpers
    (tokenization, lowercasing, stopword removal, lemmatization and named
    entity extraction). The feature vector itself is still a placeholder
    (``[None]``), so for a non-empty frame the result is currently an object
    array with a single column.

    Rows are addressed by position rather than by index label, so frames
    whose index is not the default ``0..n-1`` range (filtered, shuffled or
    concatenated frames) are handled correctly.

    Args:
        df: dataframe holding the sentence pairs in columns ``"S1"`` and
            ``"S2"``. Any other column is ignored.

    Returns:
        Array with one row per sentence pair, in the row order of ``df``.

    Raises:
        AssertionError: if ``"S1"`` or ``"S2"`` is not a column of ``df``.
    """
    assert "S1" in df.columns, "S1 not in dataframe"
    assert "S2" in df.columns, "S2 not in dataframe"

    features = [None] * len(df)  # one slot per row, filled by position

    # enumerate() yields the row position; the index labels returned by
    # iterrows() are not valid list offsets once the frame has been
    # filtered, shuffled or re-indexed.
    for position, (sentence1, sentence2) in enumerate(zip(df["S1"], df["S2"])):
        # Get all words
        tokenized_1, tokenized_2 = get_tokenized_sentences(sentence1, sentence2, return_unique_words=False)
        tokenized_lc_1, tokenized_lc_2 = get_tokenized_sentences_lowercase(tokenized_1, tokenized_2, return_unique_words=False)

        # Get words without stopwords
        no_stopwords_1, no_stopwords_2 = get_tokenized_without_stopwords(tokenized_1, tokenized_2, return_unique_words=False)
        no_stopwords_lc_1, no_stopwords_lc_2 = get_tokenized_without_stopwords(tokenized_lc_1, tokenized_lc_2, return_unique_words=False)

        # Lemmas
        lemmatized_1, lemmatized_2 = get_lemmas(tokenized_1, tokenized_2, return_unique_words=False)
        lemmatized_lc_1, lemmatized_lc_2 = get_lemmas(tokenized_lc_1, tokenized_lc_2, return_unique_words=False)

        # Named entities
        sentence_ne_1, sentence_ne_2 = get_named_entities(tokenized_1, tokenized_2)

        # Features. The intermediate results above are not consumed yet; they
        # are the intended inputs of the feature extraction functions.
        features[position] = [
            None
        ]  # TODO: declare feature vector from feature extraction functions
    return np.array(features)

In [8]:
# TEST cell don't delete it =D
# Manual sanity check of the tokenization and lemmatization helpers on a
# hand-written sentence pair; the printed lists are inspected by eye.

first = "My Bonnie White lies over the ocean, in Picadilli Circus at 3:00pm."
second = "My Bonnie lied over the sea! Over the sea..."

# Tokenize both sentences, then derive the lowercase token lists from them.
tokenized_1, tokenized_2 = get_tokenized_sentences(first, second, return_unique_words=False)
tokenized_lc_1, tokenized_lc_2 = get_tokenized_sentences_lowercase(tokenized_1, tokenized_2, return_unique_words=False)

# Lemmas for the original-case tokens, then for the lowercase tokens.
print(get_lemmas(tokenized_1, tokenized_2))
print(get_lemmas(tokenized_lc_1, tokenized_lc_2))

#TEST cell

(['My', 'Bonnie', 'White', 'lie', 'over', 'the', 'ocean', ',', 'in', 'Picadilli', 'Circus', 'at', '3:00pm', '.'], ['My', 'Bonnie', 'lie', 'over', 'the', 'sea', '!', 'Over', 'the', 'sea', '...'])
(['my', 'bonnie', 'white', 'lie', 'over', 'the', 'ocean', ',', 'in', 'picadilli', 'circus', 'at', '3:00pm', '.'], ['my', 'bonnie', 'lie', 'over', 'the', 'sea', '!', 'over', 'the', 'sea', '...'])


### Train features extraction

Using the function declared above, the features are extracted from the `train_data` dataframe. The Gold Standard values are also extracted from the `Gs` column of the dataframe. The shapes of both NumPy arrays are displayed.

In [9]:
# Build the feature matrix for the training split and pull the gold-standard
# values ('Gs' column) out as a NumPy array, then report both shapes.
# NOTE(review): the recorded run of this cell failed with a LookupError because
# the NLTK 'stopwords' corpus was not installed. Per the traceback below,
# nltk.download('stopwords') has to be run once before this cell can succeed.
train_features = get_features(train_data)
train_gs = train_data['Gs'].to_numpy()
print(f"train_features.shape: {train_features.shape}")
print(f"train_gs.shape: {train_gs.shape}")

LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - '/home/victorbadenas/nltk_data'
    - '/home/victorbadenas/anaconda3/envs/IHLT3.7/nltk_data'
    - '/home/victorbadenas/anaconda3/envs/IHLT3.7/share/nltk_data'
    - '/home/victorbadenas/anaconda3/envs/IHLT3.7/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


### Feature scaling

Features are scaled using scikit-learn's `StandardScaler`: for each feature the mean is subtracted and the result is divided by the feature's standard deviation, giving a unified feature space with zero mean and unit variance.

In [None]:
# Learn the per-feature scaling statistics from the training features, then
# apply the fitted scaler to those same features.
scaler = StandardScaler().fit(train_features)
train_features_scaled = scaler.transform(train_features)

### SVM Training

In [None]:
# Fit an RBF-kernel support vector regressor on the scaled training features
# against the gold-standard values.
svr = SVR(
    kernel='rbf',
    gamma=0.01,
    C=100,
    epsilon=0.75,
    tol=1,
)
svr.fit(train_features_scaled, train_gs)