# Classifying Text

### 1.1. Preprocessing

In [1]:
import numpy as np
import pandas as pd
import spacy
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
yelp = pd.read_csv("sentiment/yelps.csv")
# I need to work with a fraction of the data (or else my computer crashes)
yelp = yelp.sample(frac=0.1)
yelp

Unnamed: 0,business_id,positive,text
41994,SycRVheLdFcK-2jt8j5d4Q,False,If the walls of this place could speak they wo...
41774,SvsAj_yb9kGOYjknjrc45A,False,I went in with a rash on my hand. They informe...
49038,kRhjWeAPs-U5RmakIKz0Pg,True,You're out on a Saturday night and you're not ...
97374,l07ctcrDMV--TYwe3uzebQ,True,"Whenever I feel like going ""tanning"" (that's m..."
16344,iE71iwcSljg3xm2GB2Y9aA,True,This is my favorite restaurant. Not only becau...
...,...,...,...
1055,cCFWegvTavai-oOE4C4sDA,False,Make that minus one star. Even worse than the ...
60470,XXW_OFaYQkkGOGniujZFHg,False,This place suckkkkedddd. Bad food. \nI came he...
6390,LPMZ9N1sAjs2nDx7DmiZ2w,False,"Tasteless pizza, nasty sauce, and didn't liste..."
48240,oiVpyFXOAdQUHvQLwr-58g,True,Its was the 4th of July and hot as heck. I was...


In [3]:
imdb = pd.read_csv("sentiment/movies.csv")
imdb = imdb.sample(frac=0.2)
imdb

Unnamed: 0,movie,positive,text
20944,11475,False,A movie that makes you want to throw yourself ...
16558,8392,True,This film exhibits artful cinematic techniques...
310,11095,True,This movie is based on the true story of Chris...
9246,8555,False,This is a very old and cheaply made film--a ty...
49910,1520,False,Being a HUGE fan of the bottom series i was re...
...,...,...,...
41671,9983,True,"This movie is about basically human relations,..."
27120,3610,True,"In my opinion, this film has wonderful lightin..."
40395,1120,False,I saw this movie in my international cinema cl...
47449,9636,True,This is another classic Seagal movie. He walks...


In [4]:
not_alphanumeric_or_space = re.compile('[^(\w|\s|\d)]')
nlp = spacy.load('en_core_web_sm')

def preprocess(doc):
    doc = re.sub(not_alphanumeric_or_space, '', doc)
    words = [t.lemma_ for t in nlp(doc) if t.lemma_ != '-PRON-']
    return ' '.join(words).lower()

In [5]:
vect_yelp = TfidfVectorizer(min_df=2,
                         max_df=.8,
                         preprocessor=preprocess,
                         stop_words='english',
                         use_idf=True,
                         norm=False)

In [6]:
# fit the vectorizer to the Yelps dataset and get the features
features_yelp = vect_yelp.fit_transform(yelp.text)
features_yelp = features_yelp.todense()

  'stop_words.' % sorted(inconsistent))


In [7]:
# we will use this same vectorizer to transform the IMDB dataset
features_imdb = vect_yelp.transform(imdb.text)
features_imdb = features_imdb.todense()

### 1.2. Fitting the models

Now that we have vectorized our data we can fit different models in order to predict the sentiment of each text

In [8]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

In [9]:
# define the two basic models we will use (discriminative and generative approach)
NB = MultinomialNB(fit_prior=False)
LR = LogisticRegression(solver='lbfgs', n_jobs = -1)

In [10]:
# fit the logistic regression
y_yelp = yelp.positive
fitted_LR = LR.fit(features_yelp, y_yelp)

In [11]:
# lets get a first assessment of the accuracy of the model (for the Yelp data)
cross_val_score(fitted_LR, features_yelp, y_yelp, cv = 5, n_jobs = -1)

array([0.93041392, 0.9289858 ])

In [12]:
# fit the Naive Bayes
fitted_NB = NB.fit(features_yelp, y_yelp)

In [13]:
# get accuracy assesment (for the Yelp data as well)
cross_val_score(fitted_NB, features_yelp, y_yelp, cv = 2)

array([0.86082783, 0.87357471])

### 1.3. Transfer learning!

In [14]:
# get the target variable for the movies dataset
y_imdb = imdb.positive

In [15]:
# get predictions for both models using the fitted models!
y_NB = fitted_NB.predict(features_imdb)
y_LR = fitted_LR.predict(features_imdb)

In [16]:
# lets save the results
results = dict()

In [17]:
# calculate accuracy and store it
results['direct_transfer'] = accuracy_score(y_NB, y_imdb), accuracy_score(y_LR, y_imdb) 
print(results)

{'direct_transfer': (0.7011298870112989, 0.7381261873812619)}


Clearly we are not performing as well as we did in the Yelp data... There are some problems when transfering what we learned.

## Crosstraining 2

Try to improve your transfer score using the unlabelled target data, P(X). What could you learn from the target context (without using the labels, only from the X) that might help you when training your model on your source context? How can you construct a feature space in your source context that generalizes better? 

### Idea 1: Use only words that are common to both datasets

In [18]:
yelp_words = [word.lower() for doc in yelp.text for word in doc.split()]
yelp_words = set(yelp_words)

imdb_words = [word.lower() for doc in imdb.text for word in doc.split()]
imdb_words = set(imdb_words)

common_words = yelp_words.intersection(imdb_words)
common_words
print(' ******* Number of common words in both datasets:', len(common_words))

 ******* Number of common words in both datasets: 27666


In [19]:
not_alphanumeric_or_space = re.compile('[^(\w|\s|\d)]')
nlp = spacy.load('en_core_web_sm')

def preprocess(doc):
    doc = re.sub(not_alphanumeric_or_space, '', doc.lower())
    words = [t for t in doc if t in common_words]
    words = [t.lemma_ for t in nlp(doc) if t.lemma_ != '-PRON-']
    return ' '.join(words).lower()

In [20]:
vect_yelp = TfidfVectorizer(min_df=2,
                         max_df=.8,
                         preprocessor=preprocess,
                         stop_words='english',
                         use_idf=True,
                         norm=False)

In [21]:
# fit the vectorizer to the Yelps dataset and get the features
features_yelp = vect_yelp.fit_transform(yelp.text)
features_yelp = features_yelp.todense()

  'stop_words.' % sorted(inconsistent))


In [22]:
# we will use this same vectorizer to transform the IMDB dataset
features_imdb = vect_yelp.transform(imdb.text)
features_imdb = features_imdb.todense()

In [23]:
# define the two basic models we will use (discriminative and generative approach)
NB = MultinomialNB(fit_prior=False)
LR = LogisticRegression(solver='lbfgs', n_jobs=-1)

In [24]:
# fit the logistic regression
fitted_LR = LR.fit(features_yelp, y_yelp)

In [25]:
# fit the Naive Bayes
fitted_NB = NB.fit(features_yelp, y_yelp)

In [26]:
# get predictions for both models using the fitted models!
y_NB = fitted_NB.predict(features_imdb)
y_LR = fitted_LR.predict(features_imdb)

In [27]:
# calculate accuracy and save it
results['common_words'] = accuracy_score(y_NB, y_imdb), accuracy_score(y_LR, y_imdb) 
print(results)

{'direct_transfer': (0.7011298870112989, 0.7381261873812619), 'common_words': (0.6978302169783022, 0.7398260173982601)}


### Idea 2: Restrict the vocabulary to the words with the highes IDF in the target space

In [35]:
# generate this class just to get the IDF in the target space
vect_imdb = TfidfVectorizer(min_df=2,
                         max_df=.8,
                         preprocessor=preprocess,
                         stop_words='english',
                         use_idf=True,
                         norm=False)

In [36]:
vect_imdb.fit(imdb.text)
vect_imdb.idf_

  'stop_words.' % sorted(inconsistent))


array([9.11192806, 9.11192806, 8.13109881, ..., 9.11192806, 9.11192806,
       8.82424599])

In [37]:
idf = np.flip(np.argsort(vect_imdb.idf_))
len(idf)

28093

In [38]:
# select words with the highest IDF (25%)
idf_top = idf[0:int(len(idf)*0.25)]

In [39]:
top_words = []
for word, idx in vect_imdb.vocabulary_.items():
    if idx in idf_top:
        top_words.append(word)

In [40]:
not_alphanumeric_or_space = re.compile('[^(\w|\s|\d)]')
nlp = spacy.load('en_core_web_sm')

def preprocess(doc):
    doc = re.sub(not_alphanumeric_or_space, '', doc.lower())
    words = [t for t in doc if t in top_words]
    words = [t.lemma_ for t in nlp(doc) if t.lemma_ != '-PRON-']
    return ' '.join(words).lower()

In [41]:
vect_yelp = TfidfVectorizer(min_df=2,
                         max_df=.8,
                         preprocessor=preprocess,
                         stop_words='english',
                         use_idf=True,
                         norm=False)

In [42]:
# fit the vectorizer to the Yelps dataset and get the features
features_yelp = vect_yelp.fit_transform(yelp.text)
features_yelp = features_yelp.todense()

  'stop_words.' % sorted(inconsistent))


In [43]:
# we will use this same vectorizer to transform the IMDB dataset
features_imdb = vect_yelp.transform(imdb.text)
features_imdb = features_imdb.todense()

In [44]:
# define the two basic models we will use (discriminative and generative approach)
NB = MultinomialNB(fit_prior=False)
LR = LogisticRegression(solver='lbfgs', n_jobs=-1)

In [45]:
# fit the logistic regression
fitted_LR = LR.fit(features_yelp, y_yelp)

In [46]:
# fit the Naive Bayes
fitted_NB = NB.fit(features_yelp, y_yelp)

In [47]:
# get predictions for both models using the fitted models!
y_NB = fitted_NB.predict(features_imdb)
y_LR = fitted_LR.predict(features_imdb)

In [48]:
# calculate accuracy and save it
results['top_idf_vocab'] = accuracy_score(y_NB, y_imdb), accuracy_score(y_LR, y_imdb) 
print(results)

{'direct_transfer': (0.7011298870112989, 0.7381261873812619), 'common_words': (0.6978302169783022, 0.7398260173982601), 'top_idf_vocab': (0.6978302169783022, 0.7398260173982601)}


### Idea 3: PCA Sample selection - [Xi et al. (2015)](https://www.sentic.net/domain-adaptation-for-sentiment-classification.pdf)

1. PCA on target X
2. Identify documents in source with lower distance (Hotelling T to PCA)
3. Use only those documents (thus the sampling part) to train the model

In [49]:
not_alphanumeric_or_space = re.compile('[^(\w|\s|\d)]')
nlp = spacy.load('en_core_web_sm')

def preprocess(doc):
    doc = re.sub(not_alphanumeric_or_space, '', doc)
    words = [t.lemma_ for t in nlp(doc) if t.lemma_ != '-PRON-']
    return ' '.join(words).lower()

In [50]:
vect_yelp = TfidfVectorizer(min_df=2,
                         max_df=.8,
                         preprocessor=preprocess,
                         stop_words='english',
                         use_idf=True,
                         norm=False)

In [51]:
# fit the vectorizer to the Yelps dataset and get the features
features_yelp = vect_yelp.fit_transform(yelp.text)
features_yelp = features_yelp.todense()

  'stop_words.' % sorted(inconsistent))


In [52]:
# we will use this same vectorizer to transform the IMDB dataset
features_imdb = vect_yelp.transform(imdb.text)
features_imdb = features_imdb.todense()

In [53]:
from sklearn.decomposition import PCA
pca = PCA(n_components=50)
pca.fit(features_imdb)

PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [54]:
loadings_p = pca.components_.T
eigenvalues = pca.explained_variance_

In [55]:
def hotelling_t2s(loadings_p, eigenvalues, xi): 
    dist = np.array(xi.dot(loadings_p)
                            .dot(np.diag(eigenvalues ** -1))
                            .dot(loadings_p.T)
                            .dot(xi.T))
    return(dist[0][0])

In [56]:
def high_hotelling(loadings_p, eigenvalues, X, y, best):
    all_dist = []
    for xi in X:
        all_dist.append(hotelling_t2s(loadings_p,eigenvalues,xi))
    
    indices = list(np.flip(np.argsort(all_dist)))[0:best]
    sub_X = X[indices]
    sub_y = np.array(y)[indices]
    
    return(sub_X, sub_y)

In [57]:
X_sub_yelp, y_sub_yelp = high_hotelling(loadings_p, eigenvalues, features_yelp, y_yelp, int(len(y_yelp)*0.1))

In [58]:
# define the two basic models we will use (discriminative and generative approach)
NB = MultinomialNB(fit_prior=False)
LR = LogisticRegression(solver='lbfgs', n_jobs = -1)

In [59]:
# fit the logistic regression
fitted_LR = LR.fit(X_sub_yelp, y_sub_yelp)

In [60]:
# fit the Naive Bayes
fitted_NB = NB.fit(X_sub_yelp, y_sub_yelp)

In [61]:
# get predictions for both models using the fitted models!
y_NB = fitted_NB.predict(features_imdb)
y_LR = fitted_LR.predict(features_imdb)

In [62]:
# calculate accuracy and save it
results['PCA_SS'] = accuracy_score(y_NB, y_imdb), accuracy_score(y_LR, y_imdb) 
print(results)

{'direct_transfer': (0.7011298870112989, 0.7381261873812619), 'common_words': (0.6978302169783022, 0.7398260173982601), 'top_idf_vocab': (0.6978302169783022, 0.7398260173982601), 'PCA_SS': (0.6446355364463554, 0.7077292270772922)}
