In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.cluster import KMeans
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
%matplotlib inline

In [2]:
data = pd.read_csv("title_dataset.csv",error_bad_lines=False,usecols =["title_text"])
data.head()

Unnamed: 0,title_text
0,BAHIA COCOA REVIEW
1,STANDARD OIL <SRD> TO FORM FINANCIAL UNIT
2,TEXAS COMMERCE BANCSHARES <TCB> FILES PLAN
3,TALKING POINT/BANKAMERICA <BAC> EQUITY OFFER
4,NATIONAL AVERAGE PRICES FOR FARMER-OWNED RESERVE


In [3]:
data[data['title_text'].duplicated(keep=False)].sort_values('title_text').head(8)

Unnamed: 0,title_text
1948,Bundesbank says it leaves credit policies unc...
6989,Bundesbank says it leaves credit policies unc...
8100,Bundesbank sets 28-day securities repurchase ...
13258,Bundesbank sets 28-day securities repurchase ...
20476,<ACC CORP> 3RD QTR NET
20193,<ACC CORP> 3RD QTR NET
499,<FRANKLIN CALIFORNIA TAX-FREE INCOME FUND>PAYOUT
19061,<FRANKLIN CALIFORNIA TAX-FREE INCOME FUND>PAYOUT


In [4]:
data = data.drop_duplicates('title_text')

In [5]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

In [6]:
import nltk
import re

def tokenize_and_stem(text, do_stem=True):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    filtered_tokens = []
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
            
    # stem filtered tokens
    stems = [stemmer.stem(t) for t in filtered_tokens]
    
    if do_stem:
        return stems
    else:
        return filtered_tokens

In [7]:
totalvocab_stemmed = []
totalvocab_tokenized = []
for i in data['title_text']:
    allwords_stemmed = tokenize_and_stem(i)
    totalvocab_stemmed.extend(allwords_stemmed)
    
    allwords_tokenized = tokenize_and_stem(i, False)
    totalvocab_tokenized.extend(allwords_tokenized)

In [8]:
from sklearn.feature_extraction import text

eng_contractions = ["ain't", "amn't", "aren't", "can't", "could've", "couldn't",
                    "daresn't", "didn't", "doesn't", "don't", "gonna", "gotta", 
                    "hadn't", "hasn't", "haven't", "he'd", "he'll", "he's", "how'd",
                    "how'll", "how's", "I'd", "I'll", "I'm", "I've", "isn't", "it'd",
                    "it'll", "it's", "let's", "mayn't", "may've", "mightn't", 
                    "might've", "mustn't", "must've", "needn't", "o'clock", "ol'",
                    "oughtn't", "shan't", "she'd", "she'll", "she's", "should've",
                    "shouldn't", "somebody's", "someone's", "something's", "that'll",
                    "that're", "that's", "that'd", "there'd", "there're", "there's", 
                    "these're", "they'd", "they'll", "they're", "they've", "this's",
                    "those're", "tis", "twas", "twasn't", "wasn't", "we'd", "we'd've",
                    "we'll", "we're", "we've", "weren't", "what'd", "what'll", 
                    "what're", "what's", "what've", "when's", "where'd", "where're",
                    "where's", "where've", "which's", "who'd", "who'd've", "who'll",
                    "who're", "who's", "who've", "why'd", "why're", "why's", "won't",
                    "would've", "wouldn't", "y'all", "you'd", "you'll", "you're", 
                    "you've", "'s", "s"
                     ]

nltk.download('stopwords')
nltk.download('punkt')

custom_stopwords = text.ENGLISH_STOP_WORDS.union(eng_contractions)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)
vocab_frame.head()

Unnamed: 0,words
bahia,bahia
cocoa,cocoa
review,review
standard,standard
oil,oil


In [86]:
from sklearn.feature_extraction.text import TfidfVectorizer

#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.9, max_features=200000,
                                 min_df=0.1, stop_words=custom_stopwords,
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

tfidf_matrix = tfidf_vectorizer.fit_transform(data['title_text']) #fit the vectorizer to synopses

print(tfidf_matrix.shape)

terms = tfidf_vectorizer.get_feature_names()

  'stop_words.' % sorted(inconsistent))


(20029, 1)


In [87]:
type(data['title_text'])

pandas.core.series.Series

In [100]:
from sklearn.feature_extraction.text import TfidfVectorizer

#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=1.0, max_features=200000,
                                 min_df=0.0, stop_words=custom_stopwords,
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

tfidf_matrix_test = tfidf_vectorizer.fit_transform(['BAHIA COCOA REVIEW']) #fit the vectorizer to synopses

# print(tfidf_matrix.shape)

# terms = tfidf_vectorizer.get_feature_names()

  'stop_words.' % sorted(inconsistent))


In [94]:
from sklearn.cluster import KMeans
import math

num_clusters = int(math.sqrt(data.shape[0] / 2) * 1.5)

km = KMeans(algorithm='auto', copy_x=True, init='random', max_iter=1000,
    n_clusters=10, n_init=30, n_jobs=1, precompute_distances='auto',
    random_state=10, tol=0.0001, verbose=0)

km.fit(tfidf_matrix)

  # Remove the CWD from sys.path while we load stuff.


KMeans(init='random', max_iter=1000, n_clusters=10, n_init=30, n_jobs=1,
       precompute_distances='auto', random_state=10)

In [95]:
clusters = km.labels_.tolist()

data['cluster'] = clusters
data.to_csv("title_dataset_out.csv", sep=',', encoding='utf-8')

In [96]:
print("Top terms per cluster:")
print()

#sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

for i in range(10):
    print("Cluster %d words:" % i, end='')
    
    for ind in order_centroids[i, :6]: #replace 6 with n words per cluster
        print(' %s' % vocab_frame.loc[terms[ind].split(' ')].values.tolist()[0][0], end=',')
    print() #add whitespace
    print() #add whitespace
    
    print("Cluster %d titles:" % i, end='')
    print()
    print(len(data[data['cluster'] == i]['title_text'].values.tolist()))
#         print(' - %s' % title)
    print() #add whitespace
    print() #add whitespace
    
print()
print()

Top terms per cluster:

Cluster 0 words: qtr,

Cluster 0 titles:
17925


Cluster 1 words: qtr,

Cluster 1 titles:
2104


Cluster 2 words: qtr,

Cluster 2 titles:
0


Cluster 3 words: qtr,

Cluster 3 titles:
0


Cluster 4 words: qtr,

Cluster 4 titles:
0


Cluster 5 words: qtr,

Cluster 5 titles:
0


Cluster 6 words: qtr,

Cluster 6 titles:
0


Cluster 7 words: qtr,

Cluster 7 titles:
0


Cluster 8 words: qtr,

Cluster 8 titles:
0


Cluster 9 words: qtr,

Cluster 9 titles:
0






In [101]:
z = km.predict(tfidf_matrix_test)

In [102]:
z[0]

1