### Text Classification with various embedding
12/24/2020 Thu  
- nlp preprocessing 
    - Tokenize, removing stop words, stemming, lemmatization 
- Model
    - Bag of words(tf-idf)
    - Word2Vec
    - Bert
    
- Reference: https://towardsdatascience.com/text-classification-with-nlp-tf-idf-vs-word2vec-vs-bert-41ff868d1794

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import re
import nltk


### Read Data

In [3]:
# read data
reuters_train = pd.read_csv('result/reuters_final_train.csv')
reuters_test = pd.read_csv('result/reuters_final_test.csv')

In [4]:
# drop nan
reuters_train = reuters_train.dropna()
reuters_test = reuters_test.dropna()

In [5]:
reuters_train.head()

Unnamed: 0,id,topics,texts
0,4005,interest,u.s. economic data key to debt futures outlook...
1,4005,retail,u.s. economic data key to debt futures outlook...
2,4005,ipi,u.s. economic data key to debt futures outlook...
3,4012,earn,bank of british columbia 1st qtr jan 31 netope...
4,4014,earn,restaurant associates inc <ra> 4th qtr jan 3sh...


### NLP Preprocess

In [2]:
stopwords = nltk.corpus.stopwords.words("english")
porterstem = nltk.stem.porter.PorterStemmer()
lemmatization = nltk.stem.wordnet.WordNetLemmatizer()

def nlp_preprocess(text):
    # convert to lowercase and remove punctuations and characters and strip
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
            
    # Tokenize 
    tokens = text.split()
    
    # Remove Stopwords
    tokens = [word for word in tokens if word not in stopwords]
                
    # Stemming
    tokens = [porterstem.stem(word) for word in tokens]
                
    # lemmatization
    tokens = [lemmatization.lemmatize(word) for word in tokens]
    
    text = " ".join(tokens)
    return text

In [7]:
reuters_train['texts_clean'] = reuters_train['texts'].apply(lambda x:nlp_preprocess(x))
reuters_test['texts_clean'] = reuters_test['texts'].apply(lambda x:nlp_preprocess(x))


### Bag of Words

In [8]:
from sklearn import feature_extraction, model_selection, naive_bayes, pipeline, manifold, preprocessing

In [None]:
# vector_reuters_train = vectorizer.fit_transform(reuters_train['texts'])
# vector_reuters_test = vectorizer.transform(reuters_test['texts'])

In [15]:
vectorizer = feature_extraction.text.TfidfVectorizer(ngram_range=(1,2))
vectorizer.fit(reuters_train['texts_clean'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [17]:
vector_reuters_train = vectorizer.transform(reuters_train['texts_clean'])

In [18]:
sns.heatmap(vector_reuters_train.todense()[:,np.random.randint(0,X.shape[1],100)]==0, vmin=0, vmax=1, cbar=False).set_title('Sparse Matrix Sample')



NameError: name 'X' is not defined