# 4 Vectoring
## Count vectorization
Create a document-term matrix where the entry of each cell will be a count of the number of times that word occurred in that document.
### Read in text

In [45]:
import pandas as pd
import re
import string
import nltk
pd.set_option('display.max_colwidth', 100)

stopwords = nltk.corpus.stopwords.words('english')
wn = nltk.WordNetLemmatizer()

data = pd.read_csv('SMSSpamCollection', sep='\t')
data.columns = ['label', 'body_text']

data.head()

Unnamed: 0,label,body_text
0,ham,Ok lar... Joking wif u oni...
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
2,ham,U dun say so early hor... U c already then say...
3,ham,"Nah I don't think he goes to usf, he lives around here though"
4,spam,FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for ...


### Create function to remove punctuation,  tokenize, remove stopwords, and lemmatize

In [46]:
def clean_text(text):
    text = ''.join([char.lower() for char in text if char not in string.punctuation])
    text = re.sub('[0-9]+', '', text)
    tokens = re.findall('\w+', text)
    text = [wn.lemmatize(word) for word in tokens if word not in stopwords]
    return text

### Apply CountVectorizer

In [47]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(analyzer=clean_text)
X_counts = count_vect.fit_transform(data['body_text'])
print(X_counts.shape)
print(count_vect.get_feature_names()[0:500:25])

(5571, 7969)
['aa', 'aburo', 'ache', 'addamsfa', 'adventure', 'ag', 'ahthe', 'aldrine', 'alls', 'ami', 'andre', 'answerin', 'apartment', 'approach', 'aretaking', 'arrive', 'asks', 'atrocious', 'auntie', 'await']


### Apply CountVectorizer to smaller sample

In [48]:
data_sample = data[0:15]

count_vect_sample = CountVectorizer(analyzer=clean_text)
X_counts_sample = count_vect_sample.fit_transform(data_sample['body_text'])
print(X_counts_sample.shape)
print(count_vect_sample.get_feature_names()[0:100:5])

(15, 155)
['aid', 'back', 'call', 'chance', 'code', 'credit', 'darling', 'early', 'fa', 'fulfil', 'help', 'hour', 'ive', 'lar', 'like', 'message', 'network', 'oru', 'pobox', 'questionstd']


### Vectorziers output sparse matrices
**Sparse Matrix**: A matrix in which most entries are 0. In the interest of efficient storage, a sparse matrix will be stored by only storing the locations of the non-zero elements.

In [49]:
X_counts_sample

<15x155 sparse matrix of type '<class 'numpy.int64'>'
	with 175 stored elements in Compressed Sparse Row format>

In [50]:
X_counts_df = pd.DataFrame(X_counts_sample.toarray())
X_counts_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,145,146,147,148,149,150,151,152,153,154
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0


In [51]:
X_counts_df.columns = count_vect_sample.get_feature_names()
X_counts_df.head()

Unnamed: 0,aid,already,anymore,apply,around,back,blessing,breather,brother,c,...,win,winner,wkly,wonderful,wont,word,wwwdbuknet,xxx,xxxmobilemovieclub,xxxmobilemovieclubcomnqjkgighjjgcbl
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0


## N-Grams
Creates a document-term matrix where counts still occupy the cell but instead of the columns representing single terms, they represent all combinations of adjacent words of length *n* in text. Google's auto complete uses an n-grams like approach.

"data analysis is great"

| n | Name      | Tokens                                                         |
|---|-----------|----------------------------------------------------------------|
| 2 | bigram    | ['data analysis', 'analysis is', 'is great']      |
| 3 | trigram   | ['data analysis is', 'analysis is great'] |
| 4 | four-gram | ['data analysis is great']    |

In [52]:
def clean_text_join(text):
    text = ''.join([char.lower() for char in text if char not in string.punctuation])
    text = re.sub('[0-9]+', '', text)
    tokens = re.findall('\w+', text)
    text = ' '.join([wn.lemmatize(word) for word in tokens if word not in stopwords])
    return text

data['cleaned_text'] = data['body_text'].apply(lambda x: clean_text_join(x))
data.head()

Unnamed: 0,label,body_text,cleaned_text
0,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,free entry wkly comp win fa cup final tkts st may text fa receive entry questionstd txt ratetcs ...
2,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
3,ham,"Nah I don't think he goes to usf, he lives around here though",nah dont think go usf life around though
4,spam,FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for ...,freemsg hey darling week word back id like fun still tb ok xxx std chgs send rcv


### Apply CountVectorizer (w/ N-Grams)

In [53]:
ngram_vect = CountVectorizer(ngram_range=(2,2))
X_counts = ngram_vect.fit_transform(data['cleaned_text'])
print(X_counts.shape)
print(ngram_vect.get_feature_names()[0:5000:250])

(5571, 30325)
['aa exhaust', 'aeroplane aftr', 'almost ltgt', 'amp eve', 'anywhere damn', 'ask co', 'awarded city', 'back yo', 'beeen muht', 'birthday loving', 'bottle wine', 'bstfrnd rply', 'call charge', 'camera free', 'case man', 'chat luv', 'claim number', 'colleague wish', 'common car', 'cool shall']


### Apply CountVectorizer (w/ N-Grams) to smaller sample

In [54]:
data_sample = data[0:15]

ngram_vect_sample = CountVectorizer(ngram_range=(2,2))
X_counts_sample = ngram_vect_sample.fit_transform(data_sample['cleaned_text'])
print(X_counts_sample.shape)
print(ngram_vect_sample.get_feature_names()[0:100:10])

(15, 167)
['aid patent', 'call claim', 'claim tc', 'cried enough', 'early hor', 'free entry', 'hey darling', 'joking wif', 'may text', 'nah dont']


### Output sparse matrices

In [55]:
X_counts_df = pd.DataFrame(X_counts_sample.toarray())
X_counts_df.columns = ngram_vect_sample.get_feature_names()
X_counts_df.head()

Unnamed: 0,aid patent,already say,anymore tonight,apply over,apply reply,around though,back id,blessing time,breather promise,brother like,...,winner valued,wkly comp,wonderful blessing,wont take,word back,word claim,word thank,wwwdbuknet lccltd,xxx std,xxxmobilemovieclub use
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0


## TF-IDF
### Equation
$$w_{i,j}=tf_{i,j}\times\log{\frac{N}{df_i}}$$

$tf_{i,j}=$ number of times $i$ occurs in $j$ divided by total number of terms in $j$

$df_i=$ number of documents containing $i$

$N=$ total number of documents
### Example
"data analysis is great"

$tf_{\text{'data'},j}= \frac{\text{# of occurences of 'data'}}{\text{number of words in text message}} = \frac{1}{4} = 0.25$

$df_\text{'data'}= 2$

$N=20$

$w_{\text{'data'},j}=tf_{i,j}\times\log{\frac{N}{df_i}}$

$w_{\text{'data'},j}=0.25\times\log{\frac{20}{2}}$

$w_{\text{'data'},j}=0.25\times 1$

$w_{\text{'data'},j}=0.25$

### Apply TfidfVectorizer

In [57]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['body_text'])
print(X_tfidf.shape)
print(tfidf_vect.get_feature_names()[0:5000:500])

(5571, 7969)
['aa', 'babyjontet', 'cardiff', 'cudnt', 'eight', 'fring', 'hit', 'juan', 'made', 'netno']


### Apply TfidfVectorizer to smaller sample

In [58]:
data_sample = data[0:15]

tfidf_vect_sample = TfidfVectorizer(analyzer=clean_text)
X_tfidf_sample = tfidf_vect_sample.fit_transform(data_sample['body_text'])
print(X_tfidf_sample.shape)
print(tfidf_vect_sample.get_feature_names()[0:100:10])

(15, 155)
['aid', 'call', 'code', 'darling', 'fa', 'help', 'ive', 'like', 'network', 'pobox']


### Output sparse matrices

In [59]:
X_tfidf_df = pd.DataFrame(X_tfidf_sample.toarray())
X_tfidf_df.columns = tfidf_vect_sample.get_feature_names()
X_tfidf_df.head()

Unnamed: 0,aid,already,anymore,apply,around,back,blessing,breather,brother,c,...,win,winner,wkly,wonderful,wont,word,wwwdbuknet,xxx,xxxmobilemovieclub,xxxmobilemovieclubcomnqjkgighjjgcbl
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.182647,0.0,0.0,0.0,0.0,0.0,0.0,...,0.182647,0.0,0.210343,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.296149,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.296149,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.359118,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.253051,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.196092,0.0,0.253051,0.0,0.0
