In [1]:
# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import re
import string

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,HashingVectorizer
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [2]:
from google.colab import drive
import sys
drive.mount('/content/drive')
%cd "/content/drive/My Drive/Colab Notebooks/mental-health-codes"
import sys
sys.path.append('/content/drive/My Drive/Colab Notebooks/mental-health-codes')
#import utilities as ut

%reload_ext autoreload
%autoreload 2

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/Colab Notebooks/mental-health-codes


In [3]:
path = "Mental-Health-Twitter.csv"
df = pd.read_csv(path).drop(columns=['Unnamed: 0', "post_created", "user_id"])
df.head(5)

Unnamed: 0,post_id,post_text,followers,friends,favourites,statuses,retweets,label
0,637894677824413696,It's just over 2 years since I was diagnosed w...,84,211,251,837,0,1
1,637890384576778240,"It's Sunday, I need a break, so I'm planning t...",84,211,251,837,1,1
2,637749345908051968,Awake but tired. I need to sleep but my brain ...,84,211,251,837,0,1
3,637696421077123073,RT @SewHQ: #Retro bears make perfect gifts and...,84,211,251,837,2,1
4,637696327485366272,It’s hard to say whether packing lists are mak...,84,211,251,837,1,1


# Train-test data split

In [4]:
X = df.post_text.values
y = df.label.values

X_train, X_val, y_train, y_val =\
train_test_split(X, y, test_size=0.2, random_state=666)

<div align='center'><font size="6" color="#F39C12">Getting started with NLP-Feature Vectors</font></div>

<hr>


<p style='text-align:justify'><b>Key Objectives:</b>This notebook comes as a second part to the <b>[Getting started with NLP Notebooks](https://www.kaggle.com/parulpandey/getting-started-with-nlp-a-general-intro)</b>.In this notebook we shall study the various ways of vectorizing text data.Vectorization converts text data into feature vectors.</p>



## Importing the dataset

# Text Vectorization Methods

There are many methods to vctorize text, but in this notebook I shall discuss few of them:

## 1.Countvectorizer

The [Scikit-Learn's CountVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) provides a simple way to both tokenize a collection of text documents and build a vocabulary of known words, but also to encode new documents using that vocabulary.

![](https://imgur.com/xxErhnB.png)

We take a dataset and convert it into a corpus. Then we create a vocabulary of all the unique words in the corpus. Using this vocabulary, we can then  create a feature vector of the count of the words. Let's see this through a simple example. Let's say we have a corpus containing two sentences as follows

In [5]:
sentences = ['Feeling worried, even though you actually have a God who is ready to help you in any case.', 'Why is my school so different? the others have passed why do i still have assignments and exams']

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
vectorizer.fit(sentences)
vectorizer.vocabulary_

{'feeling': 9,
 'worried': 27,
 'even': 7,
 'though': 23,
 'you': 28,
 'actually': 0,
 'have': 11,
 'god': 10,
 'who': 25,
 'is': 14,
 'ready': 18,
 'to': 24,
 'help': 12,
 'in': 13,
 'any': 2,
 'case': 4,
 'why': 26,
 'my': 15,
 'school': 19,
 'so': 20,
 'different': 5,
 'the': 22,
 'others': 16,
 'passed': 17,
 'do': 6,
 'still': 21,
 'assignments': 3,
 'and': 1,
 'exams': 8}

In [7]:
# Converting all the sentences to arrays
vectorizer.transform(sentences).toarray()

array([[1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0,
        0, 1, 1, 1, 0, 1, 2],
       [0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 2, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
        1, 0, 0, 0, 2, 0, 0]])

By default, a scikit learn Count vectorizer can perform the following opertions over a text corpus:

- Encoding via utf-8
- converts text to lowercase
- Tokenizes text using word level tokenization

CountVectorizer has a number of parameters. Let's look at some of them :

### 1.1 Stopword

Sometimes, some extremely common words which would appear to be of little value in helping select documents matching a user need are excluded from the vocabulary entirely. These words are called stop words. If `stop_word` parameter is specified with a list of stopwords, they will be removed from the vocabulary. Here I'll use the stopwords from NLTK but we can also specify custom stopwords too.


In [9]:
import nltk
nltk.download('stopwords')
stopwords = stopwords.words('english')

count_vectorizer = CountVectorizer(stop_words = stopwords)
count_vectorizer.fit(X_train)

train_vectors = count_vectorizer.transform(X_train)
test_vectors = count_vectorizer.transform(X_val)

train_vectors.shape

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


(16000, 27756)

See how the columns have reduced from 21637 to 21498. This is because some of the stopwords were removed.

### 1.2 MIN_DF and MAX_DF parameter

`MIN_DF` lets you ignore those terms that appear rarely in a corpus. In other words, if `MIN_df`is 2, it  means that a word has to occur at least two documents to be considered useful.

`MAX_DF` on the other hand, ignores terms that have a document frequency strictly higher than the given threshold.These will be words which appear a lot of documents.

This means we can eliminate those words that are either rare or appear too frequently in a corpus.

When mentioned in absolute values i.e 1,2, etc, the value means if the word appears in 1 or 2 documents. However, when given in float, eg 30%, it means it appears in 30% of the documents.

In [10]:
count_vectorizer = CountVectorizer(stop_words = stopwords, min_df=2 ,max_df=0.8)
count_vectorizer.fit(X_train)

train_vectors = count_vectorizer.transform(X_train)
test_vectors = count_vectorizer.transform(X_val)

### 1.3.Custom Preprocesser

We can also preprocess the text by passing it as an argument to countvectorizer. The following options are avialable:

- strip_accents - This removes any accents from the text during the preprocessing step.
- lowercase -  which is default set as true but can be set to False if lowercasing isnot desired
- preprocessor - we can create our custom preprocessor and set this argument to that.



In [11]:
# Creating a custom preprocessor that lowercases, removes special characters, removes hyperlinks and punctuation

def custom_preprocessor(text):
    '''
    Make text lowercase, remove text in square brackets,remove links,remove special characters
    and remove words containing numbers.
    '''
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W"," ",text) # remove special chars
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)

    return text

In [12]:
# World level unigrams and bigrams
count_vectorizer = CountVectorizer(stop_words="english", preprocessor=custom_preprocessor, analyzer='word',
                            ngram_range=(1, 2), max_df=1.0, min_df=1, max_features=None)

train_vectors = count_vectorizer.fit_transform(list(X_train))
test_vectors = count_vectorizer.transform(list(X_val))
list(count_vectorizer.vocabulary_)[:10]

['lehudos',
 'dis',
 'real',
 'weapon',
 'yo',
 'cosplayyy',
 'lehudos dis',
 'dis real',
 'real weapon',
 'weapon yo']

### 1.4. N-Grams and analyzer parameter

This paramneter specifies the upper and lower limit for the range of words/characters to be extracted from text. The following n-grams range stand for:
(1,1) - unigrams  eg 'United'
(1,2) - unigrams and bigrams eg - 'United', 'United States'
(2, 2)- only bigrams etc eg 'United States)


In [13]:
# character level bigrams

count_vectorizer = CountVectorizer(stop_words="english", analyzer='char_wb', preprocessor=custom_preprocessor,
                            ngram_range=(2, 2), max_df=2, min_df=2, max_features=None)

train_vectors = count_vectorizer.fit_transform(list(X_train))
test_vectors = count_vectorizer.transform(list(X_val))

print(list(count_vectorizer.vocabulary_)[:20])

['ké', 'ém', 'â ', ' 松', '松元', '元 ', ' 彰', '彰 ', ' 森', '森 ', ' 和', '和夫', '夫 ', 'lè', 'è ', ' ツ', 'ツ ', 'hé', ' 原', '原宿']


### Creating a Baseline Model using Countvectorizer

In [14]:

count_vectorizer = CountVectorizer(token_pattern=r'\w{1,}',
                   ngram_range=(1, 2), stop_words = stopwords,preprocessor=custom_preprocessor)
count_vectorizer .fit(X_train)

train_vectors = count_vectorizer.transform(X_train)
test_vectors = count_vectorizer.transform(X_val)

In [15]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(C=1.0)
scores = model_selection.cross_val_score(clf, train_vectors, y_train, cv=5, scoring="f1")
scores

array([0.86811146, 0.85696633, 0.86278138, 0.86800618, 0.85910224])

In [16]:
# Fitting a simple Logistic Regression on Counts
clf.fit(train_vectors, y_train)

In [28]:
clf.score(test_vectors, y_val)

0.85325

This gets me a score of 0.80777 on the Public LB, which isn't bad with simple Logistic Regression model.

## 2.TF-IDF Vectorizer

![](https://imgur.com/J5lS7kX.png)

In the CountVectorizer, we use the counts of the words, in TFIDF we take the relative importance of that term in the entire corpus. TFIDF is composed of two words: TF and IDF.
**TF** stands for the normalized  term frequency. Term Frequency is a scoring of the frequency of the word in the current document.`TF = (Number of times term t appears in a document)/(Number of terms in the document)`

**IDF** or Inverse Document Frequency: is a scoring of how rare the word is across documents. `IDF = 1+log(N/n)`, where N is the number of documents and n is the number of documents a term t has appeared in.TF-IDF weight is often used in information retrieval and text mining. This weight is a statistical measure used to evaluate how important a word is to a document in a collection or corpus



TFIDF can be generated at word, character or even N gram level.

In [17]:
# word level
tfidf = TfidfVectorizer(analyzer='word',token_pattern=r'\w{1,}',max_features=5000)
train_tfidf = tfidf.fit_transform(X_train)
test_tfidf = tfidf.transform(X_val)

In [18]:
#ngram level
tfidf = TfidfVectorizer(analyzer='word',ngram_range=(2,3),token_pattern=r'\w{1,}',max_features=5000)
train_tfidf = tfidf.fit_transform(X_train)
test_tfidf = tfidf.transform(X_val)

In [19]:
# characters level
tfidf = TfidfVectorizer(analyzer='char',ngram_range=(2,3),token_pattern=r'\w{1,}',max_features=5000)
train_tfidf = tfidf.fit_transform(X_train)
test_tfidf = tfidf.transform(X_val)


### Creating a Baseline Model using TFIDF

In [20]:
tfidf_vectorizer = TfidfVectorizer( min_df=3,  max_features=None,analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = stopwords)

train_tfidf = tfidf.fit_transform(X_train)
test_tfidf = tfidf.transform(X_val)


In [21]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(C=1.0)
scores = model_selection.cross_val_score(clf, train_tfidf, y_train, cv=5, scoring="f1")
scores

array([0.87027708, 0.85786164, 0.86464771, 0.86689851, 0.86335404])

In [22]:
# Fitting a simple Logistic Regression on TFIDF
clf.fit(train_tfidf, y_train)

## 2.Hashing Vectorizer

![](https://imgur.com/e3GRaHn.png)

Hashing Vectorizer is yet another technique for vectorizing a collection of text documents. So why do we need yet another technique when we already have so many already. Well, the reason is that,both CountVectorizer and TF-IDF result in storing the entire vocabulary dictionary in memory i.e the number of unique tokens.This could be challenging in scenarios when the token vocabulary becomes very large, to the order of millions.


In [23]:
hash_vectorizer = HashingVectorizer(n_features=10000,norm=None,alternate_sign=False)
hash_vectorizer.fit(X_train)

In [24]:
train_vectors = hash_vectorizer.transform(X_train)
test_vectors = hash_vectorizer.transform(X_val)

In [25]:
print(train_vectors[0])

  (0, 306)	1.0
  (0, 2657)	1.0
  (0, 3124)	1.0
  (0, 4930)	1.0
  (0, 5411)	1.0
  (0, 6417)	1.0
  (0, 9212)	1.0
  (0, 9988)	1.0


In [26]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(C=1.0)
scores = model_selection.cross_val_score(clf, train_vectors, y_train, cv=5, scoring="f1")
scores

array([0.86418982, 0.84348641, 0.85164494, 0.85192187, 0.85356696])

In [27]:
# Fitting a simple Logistic Regression on TFIDF
clf.fit(train_vectors, y_train)