In [2]:
import msda
import process_data
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import svm
import numpy as np
import time

import pandas as pd

In [3]:
categories = ['alt.atheism', 'soc.religion.christian','comp.graphics', 'sci.med']
all_20news = fetch_20newsgroups(subset='all',categories=categories, shuffle=True, random_state=42)
all_raw_data = all_20news.data #all the data
all_data_stringsList = process_data.createWordLists(process_data.unicodeToString(all_raw_data))
all_data_words = process_data.preprocess_by_word(all_data_stringsList)
all_labels = all_20news.target #all the labels
all_full_data = process_data.vectorize(all_data_words) #convert to bag of words


In [13]:

all_full_data[0].size

115317

In [14]:
dummy = process_data.createWordLists(process_data.unicodeToString(['hola mundo, planeta raro y luna de saturno','hola gato, te pareces a saturno. hola humano te pareces a un gato']))
#pd.DataFrame(all_labels)

In [13]:
process_data.preprocess_by_word(dummy)
process_data.vectorize(process_data.preprocess_by_word(dummy))

array([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 1, 2, 2, 2, 1, 1]])

In [18]:
#Column names are words inside each review, rows are "reviews" (there are 3759 reviews)
pd.DataFrame(all_full_data)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,115307,115308,115309,115310,115311,115312,115313,115314,115315,115316
0,1,2,2,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3754,1,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3755,1,0,0,0,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3756,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3757,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
all_full_data = all_full_data.transpose() #so rows are data, columns are features (format we predominantly use)
num_mostCommon = 700
all_mostCommonFeatures_data = process_data.getMostCommonFeatures(all_full_data, num_mostCommon)
train_data, train_labels, test_data, test_labels = process_data.splitTrainTest(all_mostCommonFeatures_data, all_labels)

In [23]:
pd.DataFrame(test_data)#rows are features(words)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,130,131,132,133,134,135,136,137,138,139
0,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,1,1,1,1,0,0,1,0,1,1,...,1,1,1,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115312,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
115313,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
115314,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
115315,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
print("Shape of training data (rows are words): ", train_data.shape)
print("Shape of test data: (rows are words)", test_data.shape)

In [None]:
#classify with linear SVM
#transpose because sklearn requires (#data x #features)
clf_baseline = svm.SVC().fit(train_data.transpose(), train_labels)
baseline_preds = clf_baseline.predict(test_data.transpose())
base_accuracy = np.mean(baseline_preds == test_labels)
print("Accuracy with linear SVM on basic representation: ", base_accuracy)

In [None]:
before_msda = time.time()
#learn deep representation with msda...
prob_corruption = 0.4
num_layers = 1
subproblem_size = 400

#need to transpose data to be in the right format (#features x #data) for mSDA
#specifically, the deep representation is the output from the last layer
#with low dimensional approximation described in paper
'''
subproblem_mappings, subseq_mappings, representations  = msda.mSDA_lowDimApprox(train_data, prob_corruption,
num_layers, subproblem_size)
train_deepRep = representations[:,:,-1]
#use same weights as on training features to transform test data
test_deepRep = msda.mSDA_lowDimApprox(test_data, prob_corruption, num_layers, subproblem_size, 
subproblem_mappings, subseq_mappings)[2][:,:,-1]
'''
#without low dimensional approximation
train_mappings, train_reps = msda.mSDA(train_data, prob_corruption, num_layers)


In [None]:
train_deepRep = train_reps[:][:][-1]
#use same weights as on training features to transform test data
test_deepRep = msda.mSDA(test_data, prob_corruption, num_layers, train_mappings)[1][:][:][-1]
#'''

In [None]:
#pd.DataFrame(test_deepRep)

In [None]:
#sklearn requires (#data x #features) so transpose back
train_deepRep = train_deepRep.transpose()
test_deepRep = test_deepRep.transpose()

after_msda = time.time()
print(("used msda in %s seconds" % (after_msda - before_msda)))
print("Shape of msda train rep: ", train_deepRep.shape)
print("Shape of msda test rep: ", test_deepRep.shape)

#...and classify with linear SVM
clf_deepRep = svm.SVC().fit(train_deepRep, train_labels)
preds_with_deepRep = clf_deepRep.predict(test_deepRep)
deep_accuracy = np.mean(preds_with_deepRep == test_labels)
print("Accuracy with linear SVM on mSDA features: ", deep_accuracy)

# SVM training with original and encoded features

In [None]:
#...and classify with linear SVM

h_train = np.concatenate((train_data.T,train_deepRep),axis=-1)
h_test = np.concatenate((test_data.T, test_deepRep),axis=-1)

clf_deepRep = svm.SVC().fit( h_train, train_labels)
preds_with_deepRep = clf_deepRep.predict(h_test)
deep_accuracy = np.mean(preds_with_deepRep == test_labels)
print("Accuracy with linear SVM on mSDA features: ", deep_accuracy)

# Test with nltk

In [1]:
import nltk
#nltk.download('punkt')
#nltk.download('stopwords')

In [6]:
from nltk import word_tokenize
from nltk.corpus import stopwords

In [7]:
sentence = ["El problema del matrimonio matrimonio es que se acaba todas las noches despues de hacer el amor", "y hay que volver a reconstruirlo todas las mananas antes del desayuno."]

In [8]:
#from sklearn.feature_extraction.text import HashingVectorizer
#from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
spanish_stopwords = stopwords.words('spanish')
vectorizer = CountVectorizer(stop_words = spanish_stopwords)
X =  vectorizer.fit_transform(sentence)
print(X.toarray())

LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - '/Users/francisco.revuelta/nltk_data'
    - '/Users/francisco.revuelta/miniconda3/envs/NLP/nltk_data'
    - '/Users/francisco.revuelta/miniconda3/envs/NLP/share/nltk_data'
    - '/Users/francisco.revuelta/miniconda3/envs/NLP/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************
