# TF-IDF

In [1]:
import nltk

In [2]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [3]:
paragraph = '''In a country like India with a galloping population, unfortunately nobody is paying attention to the issue 
of population. Political parties are feeling shy, politicians are feeling shy, Parliament also does not adequately discuss 
about the issue,” said Naidu while addressing the 58th convocation of Indian Agricultural Research Institute (IARI).

He said, “You know how population is growing, creating problems. See the problems in Delhi, traffic, more human beings, 
more vehicles, more tension, less attention. If you have tension you cannot pay attention.” 
Emphasising on the need to increase food production to meet demand of growing population, Naidu said, 
“In future if population increases like this, and you are not able to adequately match it with increase in production, 
there will be problem'''

In [4]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [5]:
ps = PorterStemmer()

In [6]:
wordnet = WordNetLemmatizer()

In [7]:
sentences = nltk.sent_tokenize(paragraph)

In [8]:
corpus = []
for i in range(len(sentences)):
    review = re.sub("[^a-zA-Z]", ' ', sentences[i])
    review = review.lower()
    review = review.split()
    review = [wordnet.lemmatize(word) for word in review if word not in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(corpus).toarray()

In [10]:
X.shape

(5, 53)

In [11]:
type(X)

numpy.ndarray

In [12]:
X

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.21677716, 0.        , 0.        , 0.        , 0.32368731,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.32368731, 0.        ,
        0.        , 0.        , 0.        , 0.32368731, 0.        ,
        0.        , 0.26114888, 0.        , 0.        , 0.26114888,
        0.        , 0.        , 0.        , 0.        , 0.32368731,
        0.        , 0.        , 0.        , 0.32368731, 0.        ,
        0.        , 0.43355432, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.32368731, 0.        ],
       [0.        , 0.20243885, 0.16332639, 0.20243885, 0.20243885,
        0.        , 0.        , 0.        , 0.20243885, 0.        ,
        0.        , 0.        , 0.        , 0.20243885, 0.        ,
        0.4048777 , 0.        , 0.        , 0.        , 0.        ,
   

In [13]:
print(X[:,0])

[0.         0.         0.         0.         0.18444605]


In [14]:
print(X[:,0:5])

[[0.         0.         0.         0.         0.        ]
 [0.         0.20243885 0.16332639 0.20243885 0.20243885]
 [0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.        ]
 [0.18444605 0.         0.14880991 0.         0.        ]]


# TF-IDF

### Draw back of Bag of Words

In [15]:
# All the words have given same importance
# No Semantic information preserved
# For above two problems TF-IDF model is the solution

# Steps in TF-IDF

In [16]:
# 1. Lower case the corpus or paragraph.
# 2. Tokenization.
# 3. TF: Term Frequency, IDF: Inverse Document Frequency, TF-IDF = TF*log(IDF).
# 4. TF = No. of occurance of a word in a document / No. of words in that document.
# 5. IDF = log(No. of documents/No. of documents containing the word)
# 6. TFIDF(word) = TF(Document, word) * IDF (word)

In [17]:
import nltk

In [18]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [19]:
paragraph = '''In a country like India with a galloping population, unfortunately nobody is paying attention to the issue 
of population. Political parties are feeling shy, politicians are feeling shy, Parliament also does not adequately discuss 
about the issue,” said Naidu while addressing the 58th convocation of Indian Agricultural Research Institute (IARI).

He said, “You know how population is growing, creating problems. See the problems in Delhi, traffic, more human beings, 
more vehicles, more tension, less attention. If you have tension you cannot pay attention.” 
Emphasising on the need to increase food production to meet demand of growing population, Naidu said, 
“In future if population increases like this, and you are not able to adequately match it with increase in production, 
there will be problem'''

In [20]:
# Cleaning the Text

In [21]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [22]:
ps = PorterStemmer()

In [23]:
wordnet = WordNetLemmatizer()

In [24]:
sentences = nltk.sent_tokenize(paragraph)

In [25]:
# sentences

In [26]:
corpus = []

In [27]:
for i in range(len(sentences)):
    review = re.sub("[^a-zA-Z]", ' ', sentences[i])
    review = review.lower()
    review = review.split()
    review = [wordnet.lemmatize(word) for word in review if word not in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

In [28]:
# Creatung the TF-IDF Model

In [29]:
# # Creating the TF-IDF model
# from sklearn.feature_extraction.text import TfidfVectorizer
# cv = TfidfVectorizer()
# X = cv.fit_transform(corpus).toarray()

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(corpus)

In [31]:
X.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.21677716, 0.        , 0.        , 0.        , 0.32368731,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.32368731, 0.        ,
        0.        , 0.        , 0.        , 0.32368731, 0.        ,
        0.        , 0.26114888, 0.        , 0.        , 0.26114888,
        0.        , 0.        , 0.        , 0.        , 0.32368731,
        0.        , 0.        , 0.        , 0.32368731, 0.        ,
        0.        , 0.43355432, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.32368731, 0.        ],
       [0.        , 0.20243885, 0.16332639, 0.20243885, 0.20243885,
        0.        , 0.        , 0.        , 0.20243885, 0.        ,
        0.        , 0.        , 0.        , 0.20243885, 0.        ,
        0.4048777 , 0.        , 0.        , 0.        , 0.        ,
   

In [32]:
type(X)

scipy.sparse.csr.csr_matrix

In [33]:
X.shape

(5, 53)

In [34]:
print(X[:,0])

  (4, 0)	0.18444604729119288


In [35]:
print(X[:,:])

  (0, 26)	0.2611488808945384
  (0, 5)	0.21677716168619507
  (0, 38)	0.3236873066380182
  (0, 34)	0.3236873066380182
  (0, 51)	0.3236873066380182
  (0, 41)	0.43355432337239014
  (0, 18)	0.3236873066380182
  (0, 23)	0.3236873066380182
  (0, 29)	0.2611488808945384
  (0, 9)	0.3236873066380182
  (1, 21)	0.20243884765910772
  (1, 25)	0.20243884765910772
  (1, 44)	0.20243884765910772
  (1, 3)	0.20243884765910772
  (1, 24)	0.20243884765910772
  (1, 8)	0.20243884765910772
  (1, 49)	0.20243884765910772
  (1, 1)	0.20243884765910772
  (1, 32)	0.16332638763273197
  (1, 45)	0.13557565561148596
  (1, 13)	0.20243884765910772
  (1, 2)	0.16332638763273197
  (1, 4)	0.20243884765910772
  (1, 35)	0.20243884765910772
  (1, 40)	0.20243884765910772
  :	:
  (3, 11)	0.3420339209721722
  (3, 46)	0.3420339209721722
  (3, 42)	0.2290641031273583
  (3, 5)	0.2290641031273583
  (4, 30)	0.18444604729119288
  (4, 0)	0.18444604729119288
  (4, 17)	0.18444604729119288
  (4, 12)	0.18444604729119288
  (4, 31)	0.1844460472911