In [5]:
#First you need to install nltk from
#        https://www.nltk.org/install.html   


%matplotlib inline

from pathlib import Path

from zipfile import ZipFile
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
import nltk
from nltk import word_tokenize          
from nltk.stem.snowball import EnglishStemmer 
import matplotlib.pylab as plt
from dmba import printTermDocumentMatrix, classificationSummary, liftChart

nltk.download('punkt')

import warnings
warnings.filterwarnings('ignore')


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/tannertamondong/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Activate CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

text = ['this is the first sentence.',
        'this is a second sentence.',
        'the third sentence is here.']

# Count Vectorizer
vect = CountVectorizer()  
vects = vect.fit_transform(text)


td = pd.DataFrame(vects.todense())
td.columns = vect.get_feature_names()
term_document_matrix = td.T
term_document_matrix.columns = ['Sentence '+str(i) for i in range(1, td.shape[0]+1)]
term_document_matrix['total_count'] = term_document_matrix.sum(axis=1)


print(term_document_matrix)

AttributeError: 'CountVectorizer' object has no attribute 'get_feature_names'

In [None]:
text = ['this is the first     sentence!!',
        'this is a second Sentence :)',
        'the third sentence, is here ',
        'forth of all sentences']

# Learn features based on text. 
count_vect = CountVectorizer()
counts = count_vect.fit_transform(text)


# Select the first five rows from the data set
td = pd.DataFrame(counts.todense())
td.columns = count_vect.get_feature_names()
term_document_matrix = td.T
term_document_matrix.columns = ['Sentence '+str(i) for i in range(1, td.shape[0]+1)]
term_document_matrix['total_count'] = term_document_matrix.sum(axis=1)


print(term_document_matrix)

In [None]:
td

In [None]:
text = ['this is the first     sentence!!',
        'this is a second Sentence :)',
        'the third sentence, is here ',
        'forth of all sentences']

# Learn features based on text. Include special characters that are part of a word in the analysis
count_vect = CountVectorizer(token_pattern='[a-zA-Z!:)]+')
counts = count_vect.fit_transform(text)

td = pd.DataFrame(counts.todense())
td.columns = count_vect.get_feature_names()
term_document_matrix = td.T
term_document_matrix.columns = ['Sentence '+str(i) for i in range(1, td.shape[0]+1)]
term_document_matrix['total_count'] = term_document_matrix.sum(axis=1)


print(term_document_matrix)


In [None]:
stopWords = list(sorted(ENGLISH_STOP_WORDS))
ncolumns = 10; nrows= 2

print('First {} of {} stopwords'.format(ncolumns * nrows, len(stopWords)))
for i in range(0, len(stopWords[:(ncolumns * nrows)]), ncolumns):
    print(''.join(word.ljust(13) for word in stopWords[i:(i+ncolumns)]))

In [None]:
text = ['this is the first     sentence!! ',
        'this is a second Sentence :)',
        'the third sentence, is here ',
        'forth of all sentences']

# Create a custom tokenizer that will use NLTK for tokenizing and lemmatizing 
# (removes interpunctuation and stop words)
class LemmaTokenizer(object):
    def __init__(self):
        self.stemmer = EnglishStemmer()
        self.stopWords = set(ENGLISH_STOP_WORDS)

    def __call__(self, doc):
        return [self.stemmer.stem(t) for t in word_tokenize(doc) 
                if t.isalpha() and t not in self.stopWords]

# Learn features based on text
count_vect = CountVectorizer(tokenizer=LemmaTokenizer())
counts = count_vect.fit_transform(text)

td = pd.DataFrame(counts.todense())
td.columns = count_vect.get_feature_names()
term_document_matrix = td.T
term_document_matrix.columns = ['Sentence '+str(i) for i in range(1, td.shape[0]+1)]
term_document_matrix['total_count'] = term_document_matrix.sum(axis=1)



print(term_document_matrix)

In [None]:
text = ['this is the first     sentence!!',
        'this is a second Sentence :)',
        'the third sentence, is here ',
        'forth of all sentences']

# Apply CountVectorizer and TfidfTransformer sequentially
count_vect = CountVectorizer()
tfidfTransformer = TfidfTransformer(smooth_idf=False, norm=None)
counts = count_vect.fit_transform(text)
tfidf = tfidfTransformer.fit_transform(counts)

td = pd.DataFrame(tfidf.todense())
td.columns = count_vect.get_feature_names()
term_document_matrix = td.T
term_document_matrix.columns = ['Sentence '+str(i) for i in range(1, td.shape[0]+1)]



print(term_document_matrix)

In [None]:
# This example illustrates a classification task—to classify Internet discussion posts
#as either auto-related or electronics-related.

#Step 1: import and label records
corpus = []
label = []
with ZipFile('AutoAndElectronics.zip') as rawData:
    for info in rawData.infolist():
        if info.is_dir(): 
            continue
        label.append(1 if 'rec.autos' in info.filename else 0)
        corpus.append(rawData.read(info))

# Step 2: preprocessing (tokenization, stemming, and stopwords)
class LemmaTokenizer(object):
    def __init__(self):
        self.stemmer = EnglishStemmer()
        self.stopWords = set(ENGLISH_STOP_WORDS)
    def __call__(self, doc):
        return [self.stemmer.stem(t) for t in word_tokenize(doc) 
                if t.isalpha() and t not in self.stopWords]

preprocessor = CountVectorizer(tokenizer=LemmaTokenizer(), encoding='latin1')
preprocessedText = preprocessor.fit_transform(corpus)

In [6]:
# Select the first five rows from the data set
td = pd.DataFrame(preprocessedText.todense())
td.columns = preprocessor.get_feature_names()
term_document_matrix = td.T
term_document_matrix.columns = ['Sentence '+str(i) for i in range(1, td.shape[0]+1)]
term_document_matrix['total_count'] = term_document_matrix.sum(axis=1)

#Top 25 words 
term_document_matrix = term_document_matrix.sort_values(by ='total_count',ascending=False)[:25] 

# Print the first 10 rows 
print(term_document_matrix.drop(columns=['total_count']).head(10))

print(term_document_matrix)

NameError: name 'preprocessedText' is not defined

In [7]:
preprocessedText.shape

NameError: name 'preprocessedText' is not defined

In [None]:

# Step 3: TF-IDF and latent semantic analysis
tfidfTransformer = TfidfTransformer()
tfidf = tfidfTransformer.fit_transform(preprocessedText)

# Extract 20 concepts using LSA ()
svd = TruncatedSVD(20)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)

lsa_tfidf = lsa.fit_transform(tfidf)

In [None]:
# split dataset into 60% training and 40% test set
Xtrain, Xtest, ytrain, ytest = train_test_split(lsa_tfidf, label, test_size=0.4, random_state=0)

# run logistic regression model on training
logit_reg = LogisticRegression(solver='lbfgs')
logit_reg.fit(Xtrain, ytrain)

# print confusion matrix and accuracty
classificationSummary(ytest, logit_reg.predict(Xtest))

Confusion Matrix (Accuracy 0.9587)

       Prediction
Actual   0   1
     0 375  14
     1  19 392
