# DSCI 614 Text Mining
# Project 4: The Bag of Words and TF-IDF Model
## By Tony Nguyen


### 1. The Bag of Words and TF-IDF Model on Movie Reviews Dataset

In [None]:
import pandas as pd
twitter_data = pd.read_csv('./Twitter_Data.csv')
twitter_data.head(10)

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0
5,kiya tho refresh maarkefir comment karo,0.0
6,surat women perform yagna seeks divine grace f...,0.0
7,this comes from cabinet which has scholars lik...,0.0
8,with upcoming election india saga going import...,1.0
9,gandhi was gay does modi,1.0



### 2. Convert the column of the clean_text to a matrix of token counts using CountVectorizer and unigrams and bigrams.


In [4]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
# Create a Vectorizer Object
vectorizer = CountVectorizer()

# Convert a collection of text documents to a matrix of token counts
# Learn the vocabulary dictionary and return document-term matrix.
matrix = vectorizer.fit_transform(twitter_data['clean_text'].values.astype('U'))
  
# Summarizing the numerical features from texts
print(f'The size of the feature matrix for the texts = {matrix.get_shape()}')
print(f'The first row of the feature matrix = {matrix[0, ]}.')
print(f'There are {matrix[0, ].count_nonzero()}/{matrix.get_shape()[1]} non-zeros')

The size of the feature matrix for the texts = (162980, 106925)
The first row of the feature matrix =   (0, 103780)	1
  (0, 62481)	1
  (0, 76937)	1
  (0, 61637)	1
  (0, 40527)	1
  (0, 60317)	1
  (0, 40499)	1
  (0, 34701)	1
  (0, 43980)	1
  (0, 13684)	1
  (0, 95482)	2
  (0, 29341)	1
  (0, 51357)	1
  (0, 80438)	1
  (0, 91104)	2
  (0, 103994)	1
  (0, 30477)	1
  (0, 93828)	1
  (0, 105521)	1
  (0, 39396)	1
  (0, 51985)	1
  (0, 87792)	2
  (0, 8389)	3
  (0, 67998)	1
  (0, 17907)	1
  (0, 34636)	1
  (0, 77543)	1
  (0, 94774)	1.
There are 28/106925 non-zeros


In [3]:
import numpy as np
# Create a Vectorizer Object using 1-grams and 2-grams
vectorizer = CountVectorizer(ngram_range=(1, 2))

# Encode the corpus
# Extract token counts out of raw text documents using the vocabulary
matrix = vectorizer.fit_transform(twitter_data['clean_text'].values.astype('U'))
  
# Summarizing the numerical features from texts
print(f'The size of the feature matrix for the texts = {matrix.get_shape()}')
print(f'The first row of the feature matrix = {matrix[0, ]}.')
print(f'There are {matrix[0, ].count_nonzero()}/{matrix.get_shape()[1]} non-zeros')

The size of the feature matrix for the texts = (162980, 1199726)
The first row of the feature matrix =   (0, 1145440)	1
  (0, 666553)	1
  (0, 831879)	1
  (0, 658439)	1
  (0, 435499)	1
  (0, 644088)	1
  (0, 435147)	1
  (0, 357405)	1
  (0, 480830)	1
  (0, 134189)	1
  (0, 1029272)	2
  (0, 299531)	1
  (0, 554530)	1
  (0, 867040)	1
  (0, 976966)	2
  (0, 1155022)	1
  (0, 308537)	1
  (0, 1006650)	1
  (0, 1183134)	1
  (0, 419834)	1
  (0, 562994)	1
  (0, 940186)	2
  (0, 66073)	3
  (0, 728515)	1
  (0, 175799)	1
  :	:
  (0, 357481)	1
  (0, 481028)	1
  (0, 134240)	1
  (0, 1032237)	1
  (0, 299593)	1
  (0, 555045)	1
  (0, 867045)	1
  (0, 1038737)	1
  (0, 977642)	1
  (0, 1155443)	1
  (0, 309052)	1
  (0, 1007448)	1
  (0, 1183662)	1
  (0, 420543)	1
  (0, 563133)	1
  (0, 977518)	1
  (0, 940260)	1
  (0, 72487)	1
  (0, 729194)	1
  (0, 175815)	1
  (0, 74556)	1
  (0, 940632)	1
  (0, 356855)	1
  (0, 838915)	1
  (0, 75386)	1.
There are 60/1199726 non-zeros



### 3. Perform the tf-idf analasys on the column of the clean_text using CountVectorizer and TfidfTransformer

In [4]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Create a Vectorizer Object using default parameters
vectorizer = CountVectorizer()

# Convert a collection of text documents to a matrix of token counts
# Extract token counts out of raw text documents using the vocabulary
token_count_matrix=vectorizer.fit_transform(twitter_data['clean_text'].values.astype('U'))
print(f'The size of the count matrix for the texts = {token_count_matrix.get_shape()}')
print(f'The sparse count matrix is as follows:')
print(token_count_matrix)

# Create a tf_idf object using default parameters
tf_idf_transformer=TfidfTransformer(use_idf=True, smooth_idf=True, sublinear_tf=False) 

# Fit to the count matrix, then transform it to a normalized tf-idf representation
tf_idf_matrix = tf_idf_transformer.fit_transform(token_count_matrix)

print(f'The size of the tf_idf matrix for the texts = {tf_idf_matrix.get_shape()}')
print(f'The sparse tf_idf matrix is as follows:')
print(tf_idf_matrix)

The size of the count matrix for the texts = (162980, 106925)
The sparse count matrix is as follows:
  (0, 103780)	1
  (0, 62481)	1
  (0, 76937)	1
  (0, 61637)	1
  (0, 40527)	1
  (0, 60317)	1
  (0, 40499)	1
  (0, 34701)	1
  (0, 43980)	1
  (0, 13684)	1
  (0, 95482)	2
  (0, 29341)	1
  (0, 51357)	1
  (0, 80438)	1
  (0, 91104)	2
  (0, 103994)	1
  (0, 30477)	1
  (0, 93828)	1
  (0, 105521)	1
  (0, 39396)	1
  (0, 51985)	1
  (0, 87792)	2
  (0, 8389)	3
  (0, 67998)	1
  (0, 17907)	1
  :	:
  (162979, 65873)	1
  (162979, 56603)	1
  (162979, 95375)	1
  (162979, 63947)	1
  (162979, 5841)	1
  (162979, 5191)	1
  (162979, 74812)	1
  (162979, 17962)	1
  (162979, 69383)	1
  (162979, 43151)	1
  (162979, 103788)	1
  (162979, 47231)	1
  (162979, 82981)	2
  (162979, 34101)	1
  (162979, 34124)	1
  (162979, 89693)	1
  (162979, 77339)	1
  (162979, 10864)	2
  (162979, 44215)	1
  (162979, 25873)	1
  (162979, 56816)	1
  (162979, 58707)	1
  (162979, 29767)	1
  (162979, 58706)	1
  (162979, 41683)	1
The size of the t


### 4. Perform the tf-idf analysis on the column of the clean_text using Tfidfvectorizer

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TfidfVectorizer Object using default parameters: use_idf=True, smooth_idf=True, sublinear_tf=False
tfidf_vectorizer = TfidfVectorizer(use_idf=True, smooth_idf=True, sublinear_tf=False)

# Fit to the corpus, then convert a collection of raw documents to a matrix of TF-IDF features.
tf_idf_matrix = tfidf_vectorizer.fit_transform(twitter_data['clean_text'].values.astype('U'))

print(f'The size of the tf_idf matrix for the texts = {tf_idf_matrix.get_shape()}')
print(f'The sparse tf_idf matrix is as follows:')
print(tf_idf_matrix)

The size of the tf_idf matrix for the texts = (162980, 106925)
The sparse tf_idf matrix is as follows:
  (0, 94774)	0.23660466349027023
  (0, 77543)	0.2644440348083726
  (0, 34636)	0.2517650765565202
  (0, 17907)	0.18097741396817904
  (0, 67998)	0.08143445561404204
  (0, 8389)	0.1858717788592678
  (0, 87792)	0.2387429732629813
  (0, 51985)	0.2030659320709923
  (0, 39396)	0.12597803421480075
  (0, 105521)	0.12028353756840601
  (0, 93828)	0.13399957155223954
  (0, 30477)	0.1452398919187344
  (0, 103994)	0.10858429770134567
  (0, 91104)	0.3105076608504044
  (0, 80438)	0.3147731549491533
  (0, 51357)	0.15495537757250963
  (0, 29341)	0.20379916078250251
  (0, 95482)	0.1103369330198538
  (0, 13684)	0.2282834313221842
  (0, 43980)	0.11613399134304461
  (0, 34701)	0.20196161088022185
  (0, 40499)	0.1921722923035772
  (0, 60317)	0.21686927892530608
  (0, 40527)	0.12554675089481607
  (0, 61637)	0.1889103368727531
  :	:
  (162979, 10864)	0.3364426864094962
  (162979, 77339)	0.1506084640568452
  (


### 5. Perform the tf-idf analysis on the column of the clean_text using HashingVectorizer and TfidfTransformer

In [6]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Create a Vectorizer Object using default parameters
hash_vectorizer = HashingVectorizer()

# Convert a collection of text documents to a matrix of token counts
token_count_matrix=hash_vectorizer.fit_transform(twitter_data['clean_text'].values.astype('U'))
print(f'The size of the count matrix for the texts = {token_count_matrix.get_shape()}')
print(f'The sparse count matrix is as follows:')
print(token_count_matrix)

# Create a tf_idf object using default parameters
tf_idf_transformer=TfidfTransformer(use_idf=True, smooth_idf=True, sublinear_tf=False) 

# Fit to the count matrix, then transform it to a normalized tf-idf representation
tf_idf_matrix = tf_idf_transformer.fit_transform(token_count_matrix)

print(f'The size of the tf_idf matrix for the texts = {tf_idf_matrix.get_shape()}')
print(f'The sparse tf_idf matrix is as follows:')
print(tf_idf_matrix)

The size of the count matrix for the texts = (162980, 1048576)
The sparse count matrix is as follows:
  (0, 160541)	0.14907119849998599
  (0, 168557)	0.14907119849998599
  (0, 180525)	-0.4472135954999579
  (0, 232512)	0.14907119849998599
  (0, 263274)	0.14907119849998599
  (0, 277794)	-0.14907119849998599
  (0, 286878)	-0.29814239699997197
  (0, 288398)	0.14907119849998599
  (0, 360502)	0.29814239699997197
  (0, 387101)	-0.14907119849998599
  (0, 433698)	0.14907119849998599
  (0, 434864)	0.14907119849998599
  (0, 449993)	-0.14907119849998599
  (0, 465141)	-0.14907119849998599
  (0, 482215)	-0.14907119849998599
  (0, 484920)	-0.14907119849998599
  (0, 490370)	0.29814239699997197
  (0, 522187)	0.14907119849998599
  (0, 614924)	0.14907119849998599
  (0, 646934)	0.14907119849998599
  (0, 747378)	-0.14907119849998599
  (0, 748718)	0.14907119849998599
  (0, 808196)	-0.14907119849998599
  (0, 839641)	-0.14907119849998599
  (0, 865698)	0.14907119849998599
  :	:
  (162979, 257965)	0.16222142113