In [1]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from textblob import Word
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Sample corpus
documents = ['Machine learning is the study of computer algorithms that improve automatically through experience.\
             Machine learning algorithms build a mathematical model based on sample data, known as training data.\
             The discipline of machine learning employs various approaches to teach computers to accomplish tasks \
             where no fully satisfactory algorithm is available.',
             'Machine learning is closely related to computational statistics, which focuses on making predictions using computers.\
             The study of mathematical optimization delivers methods, theory and application domains to the field of machine learning.',
             'Machine learning involves computers discovering how they can perform tasks without being explicitly programmed to do so. \
             It involves computers learning from data provided so that they carry out certain tasks.',
             'Machine learning approaches are traditionally divided into three broad categories, depending on the nature of the "signal"\
             or "feedback" available to the learning system: Supervised, Unsupervised and Reinforcement'
]

In [25]:
pd.set_option('display.max_colwidth', 0)
pd.set_option('display.max_columns', 0)

In [4]:
documents_df=pd.DataFrame(documents,columns=['documents'])

### Corpus Dataframe

In [5]:
documents_df

Unnamed: 0,documents
0,"Machine learning is the study of computer algorithms that improve automatically through experience. Machine learning algorithms build a mathematical model based on sample data, known as training data. The discipline of machine learning employs various approaches to teach computers to accomplish tasks where no fully satisfactory algorithm is available."
1,"Machine learning is closely related to computational statistics, which focuses on making predictions using computers. The study of mathematical optimization delivers methods, theory and application domains to the field of machine learning."
2,Machine learning involves computers discovering how they can perform tasks without being explicitly programmed to do so. It involves computers learning from data provided so that they carry out certain tasks.
3,"Machine learning approaches are traditionally divided into three broad categories, depending on the nature of the ""signal"" or ""feedback"" available to the learning system: Supervised, Unsupervised and Reinforcement"


In [6]:
N=documents_df.shape[0]

### Some preprocessing of text

In [7]:
documents_df['documents'] = documents_df['documents'].apply(lambda x: " ".join(re.sub('[^a-zA-Z \n]', ' ', x).lower() for x in x.split()))
stop = stopwords.words('english')
documents_df['documents'] = documents_df['documents'].apply(lambda x: " ".join(Word(x).lemmatize() for x in x.split() if x not in stop))

### Documents post preprocessing

In [8]:
documents_df

Unnamed: 0,documents
0,machine learning study computer algorithm improve automatically experience machine learning algorithm build mathematical model based sample data known training data discipline machine learning employ various approach teach computer accomplish task fully satisfactory algorithm available
1,machine learning closely related computational statistic focus making prediction using computer study mathematical optimization delivers method theory application domain field machine learning
2,machine learning involves computer discovering perform task without explicitly programmed involves computer learning data provided carry certain task
3,machine learning approach traditionally divided three broad category depending nature signal feedback available learning system supervised unsupervised reinforcement


In [9]:
count_vectorizer=CountVectorizer()

In [10]:
count_vectors=count_vectorizer.fit_transform(documents_df['documents'])

In [11]:
count_vectors.toarray()

array([[1, 3, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 2, 2, 0, 0, 1, 0, 0, 0,
        1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 3, 3, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 2, 2, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
        0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 2, 1, 0, 0, 0, 1, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 2, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0]])

### tf-Term Frequency vectors

In [26]:
pd.DataFrame(count_vectors.toarray(),columns=[count_vectorizer.get_feature_names()])

Unnamed: 0,accomplish,algorithm,application,approach,automatically,available,based,broad,build,carry,category,certain,closely,computational,computer,data,delivers,depending,discipline,discovering,divided,domain,employ,experience,explicitly,feedback,field,focus,fully,improve,involves,known,learning,machine,making,mathematical,method,model,nature,optimization,perform,prediction,programmed,provided,reinforcement,related,sample,satisfactory,signal,statistic,study,supervised,system,task,teach,theory,three,traditionally,training,unsupervised,using,various,without
0,1,3,0,1,1,1,1,0,1,0,0,0,0,0,2,2,0,0,1,0,0,0,1,1,0,0,0,0,1,1,0,1,3,3,0,1,0,1,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,1,1,0,0,0,1,0,0,1,0
1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,1,0,1,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,2,2,1,1,1,0,0,1,0,1,0,0,0,1,0,0,0,1,1,0,0,0,0,1,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,1,0,1,0,0,2,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,2,0,2,1,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,1
3,0,0,0,1,0,1,0,1,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,2,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0,1,1,0,1,0,0,0


In [27]:
# calculating document frequencies from term frequencies
document_frequencies=np.sum(np.where(count_vectors.toarray()>=1,1,0),axis=0)

In [14]:
document_frequencies

array([1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 3, 2, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 4, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1])

### idf-Inverse document frequencies

In [28]:
pd.DataFrame((np.log(N/document_frequencies)).reshape(1,-1),columns=[count_vectorizer.get_feature_names()])

Unnamed: 0,accomplish,algorithm,application,approach,automatically,available,based,broad,build,carry,category,certain,closely,computational,computer,data,delivers,depending,discipline,discovering,divided,domain,employ,experience,explicitly,feedback,field,focus,fully,improve,involves,known,learning,machine,making,mathematical,method,model,nature,optimization,perform,prediction,programmed,provided,reinforcement,related,sample,satisfactory,signal,statistic,study,supervised,system,task,teach,theory,three,traditionally,training,unsupervised,using,various,without
0,1.386294,1.386294,1.386294,0.693147,1.386294,0.693147,1.386294,1.386294,1.386294,1.386294,1.386294,1.386294,1.386294,1.386294,0.287682,0.693147,1.386294,1.386294,1.386294,1.386294,1.386294,1.386294,1.386294,1.386294,1.386294,1.386294,1.386294,1.386294,1.386294,1.386294,1.386294,1.386294,0.0,0.0,1.386294,0.693147,1.386294,1.386294,1.386294,1.386294,1.386294,1.386294,1.386294,1.386294,1.386294,1.386294,1.386294,1.386294,1.386294,1.386294,0.693147,1.386294,1.386294,0.693147,1.386294,1.386294,1.386294,1.386294,1.386294,1.386294,1.386294,1.386294,1.386294


### tf-idf vectors

In [29]:
pd.DataFrame(count_vectors.toarray()*(np.log(N/document_frequencies)),columns=[count_vectorizer.get_feature_names()])

Unnamed: 0,accomplish,algorithm,application,approach,automatically,available,based,broad,build,carry,category,certain,closely,computational,computer,data,delivers,depending,discipline,discovering,divided,domain,employ,experience,explicitly,feedback,field,focus,fully,improve,involves,known,learning,machine,making,mathematical,method,model,nature,optimization,perform,prediction,programmed,provided,reinforcement,related,sample,satisfactory,signal,statistic,study,supervised,system,task,teach,theory,three,traditionally,training,unsupervised,using,various,without
0,1.386294,4.158883,0.0,0.693147,1.386294,0.693147,1.386294,0.0,1.386294,0.0,0.0,0.0,0.0,0.0,0.575364,1.386294,0.0,0.0,1.386294,0.0,0.0,0.0,1.386294,1.386294,0.0,0.0,0.0,0.0,1.386294,1.386294,0.0,1.386294,0.0,0.0,0.0,0.693147,0.0,1.386294,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.386294,1.386294,0.0,0.0,0.693147,0.0,0.0,0.693147,1.386294,0.0,0.0,0.0,1.386294,0.0,0.0,1.386294,0.0
1,0.0,0.0,1.386294,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.386294,1.386294,0.287682,0.0,1.386294,0.0,0.0,0.0,0.0,1.386294,0.0,0.0,0.0,0.0,1.386294,1.386294,0.0,0.0,0.0,0.0,0.0,0.0,1.386294,0.693147,1.386294,0.0,0.0,1.386294,0.0,1.386294,0.0,0.0,0.0,1.386294,0.0,0.0,0.0,1.386294,0.693147,0.0,0.0,0.0,0.0,1.386294,0.0,0.0,0.0,0.0,1.386294,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.386294,0.0,1.386294,0.0,0.0,0.575364,0.693147,0.0,0.0,0.0,1.386294,0.0,0.0,0.0,0.0,1.386294,0.0,0.0,0.0,0.0,0.0,2.772589,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.386294,0.0,1.386294,1.386294,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.386294,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.386294
3,0.0,0.0,0.0,0.693147,0.0,0.693147,0.0,1.386294,0.0,0.0,1.386294,0.0,0.0,0.0,0.0,0.0,0.0,1.386294,0.0,0.0,1.386294,0.0,0.0,0.0,0.0,1.386294,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.386294,0.0,0.0,0.0,0.0,0.0,1.386294,0.0,0.0,0.0,1.386294,0.0,0.0,1.386294,1.386294,0.0,0.0,0.0,1.386294,1.386294,0.0,1.386294,0.0,0.0,0.0


In [17]:
# Sub linear tf scaling
count_vectors_inf=np.where(count_vectors.toarray()==0,float('inf'),count_vectors.toarray())
count_vectors_sublinear=np.where(np.log(count_vectors_inf)==float('inf'),0,1+np.log(count_vectors_inf))

### wf-idf

In [30]:
pd.DataFrame(count_vectors_sublinear*np.log(N/document_frequencies),columns=[count_vectorizer.get_feature_names()])

Unnamed: 0,accomplish,algorithm,application,approach,automatically,available,based,broad,build,carry,category,certain,closely,computational,computer,data,delivers,depending,discipline,discovering,divided,domain,employ,experience,explicitly,feedback,field,focus,fully,improve,involves,known,learning,machine,making,mathematical,method,model,nature,optimization,perform,prediction,programmed,provided,reinforcement,related,sample,satisfactory,signal,statistic,study,supervised,system,task,teach,theory,three,traditionally,training,unsupervised,using,various,without
0,1.386294,2.909294,0.0,0.693147,1.386294,0.693147,1.386294,0.0,1.386294,0.0,0.0,0.0,0.0,0.0,0.487088,1.1736,0.0,0.0,1.386294,0.0,0.0,0.0,1.386294,1.386294,0.0,0.0,0.0,0.0,1.386294,1.386294,0.0,1.386294,0.0,0.0,0.0,0.693147,0.0,1.386294,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.386294,1.386294,0.0,0.0,0.693147,0.0,0.0,0.693147,1.386294,0.0,0.0,0.0,1.386294,0.0,0.0,1.386294,0.0
1,0.0,0.0,1.386294,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.386294,1.386294,0.287682,0.0,1.386294,0.0,0.0,0.0,0.0,1.386294,0.0,0.0,0.0,0.0,1.386294,1.386294,0.0,0.0,0.0,0.0,0.0,0.0,1.386294,0.693147,1.386294,0.0,0.0,1.386294,0.0,1.386294,0.0,0.0,0.0,1.386294,0.0,0.0,0.0,1.386294,0.693147,0.0,0.0,0.0,0.0,1.386294,0.0,0.0,0.0,0.0,1.386294,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.386294,0.0,1.386294,0.0,0.0,0.487088,0.693147,0.0,0.0,0.0,1.386294,0.0,0.0,0.0,0.0,1.386294,0.0,0.0,0.0,0.0,0.0,2.3472,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.386294,0.0,1.386294,1.386294,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.1736,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.386294
3,0.0,0.0,0.0,0.693147,0.0,0.693147,0.0,1.386294,0.0,0.0,1.386294,0.0,0.0,0.0,0.0,0.0,0.0,1.386294,0.0,0.0,1.386294,0.0,0.0,0.0,0.0,1.386294,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.386294,0.0,0.0,0.0,0.0,0.0,1.386294,0.0,0.0,0.0,1.386294,0.0,0.0,1.386294,1.386294,0.0,0.0,0.0,1.386294,1.386294,0.0,1.386294,0.0,0.0,0.0


In [20]:
# Maximum tf normalization
a=0.4
count_vectors_maxnormalized=a+(1-a)*count_vectors.toarray()/np.max(count_vectors.toarray(),axis=1).reshape(-1,1)

### ntf-idf

In [31]:
pd.DataFrame(count_vectors_maxnormalized*np.log(N/document_frequencies),columns=[count_vectorizer.get_feature_names()])

Unnamed: 0,accomplish,algorithm,application,approach,automatically,available,based,broad,build,carry,category,certain,closely,computational,computer,data,delivers,depending,discipline,discovering,divided,domain,employ,experience,explicitly,feedback,field,focus,fully,improve,involves,known,learning,machine,making,mathematical,method,model,nature,optimization,perform,prediction,programmed,provided,reinforcement,related,sample,satisfactory,signal,statistic,study,supervised,system,task,teach,theory,three,traditionally,training,unsupervised,using,various,without
0,0.831777,1.386294,0.554518,0.415888,0.831777,0.415888,0.831777,0.554518,0.831777,0.554518,0.554518,0.554518,0.554518,0.554518,0.230146,0.554518,0.554518,0.554518,0.831777,0.554518,0.554518,0.554518,0.831777,0.831777,0.554518,0.554518,0.554518,0.554518,0.831777,0.831777,0.554518,0.831777,0.0,0.0,0.554518,0.415888,0.554518,0.831777,0.554518,0.554518,0.554518,0.554518,0.554518,0.554518,0.554518,0.554518,0.831777,0.831777,0.554518,0.554518,0.415888,0.554518,0.554518,0.415888,0.831777,0.554518,0.554518,0.554518,0.831777,0.554518,0.554518,0.831777,0.554518
1,0.554518,0.554518,0.970406,0.277259,0.554518,0.277259,0.554518,0.554518,0.554518,0.554518,0.554518,0.554518,0.970406,0.970406,0.201377,0.277259,0.970406,0.554518,0.554518,0.554518,0.554518,0.970406,0.554518,0.554518,0.554518,0.554518,0.970406,0.970406,0.554518,0.554518,0.554518,0.554518,0.0,0.0,0.970406,0.485203,0.970406,0.554518,0.554518,0.970406,0.554518,0.970406,0.554518,0.554518,0.554518,0.970406,0.554518,0.554518,0.554518,0.970406,0.485203,0.554518,0.554518,0.277259,0.554518,0.970406,0.554518,0.554518,0.554518,0.554518,0.970406,0.554518,0.554518
2,0.554518,0.554518,0.554518,0.277259,0.554518,0.277259,0.554518,0.554518,0.554518,0.970406,0.554518,0.970406,0.554518,0.554518,0.287682,0.485203,0.554518,0.554518,0.554518,0.970406,0.554518,0.554518,0.554518,0.554518,0.970406,0.554518,0.554518,0.554518,0.554518,0.554518,1.386294,0.554518,0.0,0.0,0.554518,0.277259,0.554518,0.554518,0.554518,0.554518,0.970406,0.554518,0.970406,0.970406,0.554518,0.554518,0.554518,0.554518,0.554518,0.554518,0.277259,0.554518,0.554518,0.693147,0.554518,0.554518,0.554518,0.554518,0.554518,0.554518,0.554518,0.554518,0.970406
3,0.554518,0.554518,0.554518,0.485203,0.554518,0.485203,0.554518,0.970406,0.554518,0.554518,0.970406,0.554518,0.554518,0.554518,0.115073,0.277259,0.554518,0.970406,0.554518,0.554518,0.970406,0.554518,0.554518,0.554518,0.554518,0.970406,0.554518,0.554518,0.554518,0.554518,0.554518,0.554518,0.0,0.0,0.554518,0.277259,0.554518,0.554518,0.970406,0.554518,0.554518,0.554518,0.554518,0.554518,0.970406,0.554518,0.554518,0.554518,0.970406,0.554518,0.277259,0.970406,0.970406,0.277259,0.554518,0.554518,0.970406,0.970406,0.554518,0.970406,0.554518,0.554518,0.554518
