In [14]:
# 0096_nlp
import string
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import pandas as pd

In [1]:
class TokenProcessor:
    """Token-level clean-up helpers for lists of already tokenised words.

    The NLTK-backed resources (stemmer, stop-word set, lemmatizer) are built
    lazily on first access and then cached on the instance.
    """

    def __init__(self):
        # Lazily populated caches; see the properties of the same name.
        self._stemmer = None
        self._stop_words = None
        self._lemmatizer = None
        # Reserved slots: nothing in this class populates or reads them yet.
        self._ner_tagger = None
        self._pos_tagger = None

    @property
    def stemmer(self):
        """Return the cached ``PorterStemmer``, creating it on first use."""
        if self._stemmer is None:
            # The notebook's import cell never brings PorterStemmer into
            # scope, so this property used to raise NameError. Importing it
            # here keeps the fix local to the only place that needs it.
            from nltk.stem import PorterStemmer

            self._stemmer = PorterStemmer()
        return self._stemmer

    def stem(self, words):
        """Return a new list holding the stem of every word in ``words``."""
        stemmer = self.stemmer
        return [stemmer.stem(w) for w in words]

    @property
    def stop_words(self):
        """Return the cached set of English stop words, loading it on first use."""
        if self._stop_words is None:
            self._stop_words = set(stopwords.words('english'))
        return self._stop_words

    def clean_stop_words(self, words):
        """Return ``words`` without English stop words (case-insensitive).

        Items may be plain strings or tuples. For a tuple only its first
        element is tested; the tuple is kept or dropped as a whole.
        """
        stop_words = self.stop_words
        return [
            w for w in words
            if (w[0] if isinstance(w, tuple) else w).lower() not in stop_words
        ]

    @property
    def lemmatizer(self):
        """Return the cached ``WordNetLemmatizer``, creating it on first use."""
        if self._lemmatizer is None:
            self._lemmatizer = WordNetLemmatizer()
        return self._lemmatizer

    def lemmatize(self, words):
        """Return the lemma of every word; words are lower-cased first."""
        lemmatizer = self.lemmatizer
        return [lemmatizer.lemmatize(w.lower()) for w in words]

    @staticmethod
    def clean_digits(words):
        """Return ``words`` without the items that consist only of digits."""
        return [w for w in words if not w.isdigit()]

    @staticmethod
    def upper(words):
        """Return an upper-cased copy of ``words``."""
        return [w.upper() for w in words]

    @staticmethod
    def lower(words):
        """Return a lower-cased copy of ``words``."""
        return [w.lower() for w in words]

    @staticmethod
    def sentence(words):
        """Join ``words`` into one string separated by single spaces."""
        return " ".join(words)


class TextProcessor:
    """Stateless helpers that work on raw text, before it is tokenised."""

    def __init__(self):
        pass

    @staticmethod
    def clean_punctuations(text):
        """Return ``text`` with every ``string.punctuation`` character removed."""
        # Mapping a character to None tells str.translate to delete it.
        deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
        return text.translate(deletion_table)

    @staticmethod
    def to_words(text):
        """Split ``text`` into word tokens with NLTK's ``word_tokenize``."""
        tokens = word_tokenize(text)
        return tokens

    @staticmethod
    def to_sentences(text):
        """Split ``text`` into sentences with NLTK's ``sent_tokenize``."""
        sentences = sent_tokenize(text)
        return sentences


class NLPProcessor:
    """Sentence-cleaning pipeline built on a TextProcessor and a TokenProcessor."""

    def __init__(self, text_kwargs=None, token_kwargs=None):
        """Create the two underlying processors.

        Args:
            text_kwargs: Optional dict of keyword arguments forwarded to
                ``TextProcessor``. ``None`` means no arguments.
            token_kwargs: Optional dict of keyword arguments forwarded to
                ``TokenProcessor``. ``None`` means no arguments.
        """
        # None sentinels replace the former ``{}`` defaults: a dict literal in
        # a signature is created once and shared by every call.
        self.txtp = TextProcessor(**(text_kwargs or {}))
        self.tokp = TokenProcessor(**(token_kwargs or {}))

    def clean_words(self, sentence):
        """Return the cleaned, upper-cased tokens of ``sentence``.

        Steps, in order: strip punctuation, tokenise, drop all-digit tokens,
        lemmatise, upper-case, drop stop words.
        """
        txtp = self.txtp
        tokp = self.tokp

        # NOTE(review): punctuation is removed before tokenising, so a
        # contraction is fused into one token ("How're" -> "Howre") and is
        # no longer recognisable as a stop word. Confirm this is intended.
        res = txtp.clean_punctuations(sentence)

        words = txtp.to_words(res)
        words = tokp.clean_digits(words)
        words = tokp.lemmatize(words)
        words = tokp.upper(words)
        words = tokp.clean_stop_words(words)

        return words

    def clean_sentence(self, sentence):
        """Return ``sentence`` cleaned by ``clean_words`` and re-joined into a string."""
        words = self.clean_words(sentence)

        return self.tokp.sentence(words)

In [2]:
# Build one pipeline instance with default settings; a later cell applies it
# to every sentence in the frame.
nlpp = NLPProcessor()

In [15]:
# Toy corpus: four short sentences, kept under their own names as well.
s1, s2, s3, s4 = (
    "Hello there! How're things going? It's been a long time.",
    "the red riding hood got lost in the forest",
    "I am think of moving to a new country. But having a hard time deciding",
    "what should I do when I see a new dress I like but cost too much?",
)

# One record per sentence; event ids are simply 1..4 in sentence order.
df = pd.DataFrame([
    {"event_id": event_id, "sentence": text}
    for event_id, text in enumerate((s1, s2, s3, s4), start=1)
])

In [17]:
# Run every raw sentence through the cleaning pipeline; the bound method is
# passed directly, no wrapper function needed.
df["clean"] = df["sentence"].apply(nlpp.clean_sentence)

In [18]:
# Show the original sentences next to their cleaned form.
df

Unnamed: 0,event_id,sentence,clean
0,1,Hello there! How're things going? It's been a ...,HELLO HOWRE THING GOING LONG TIME
1,2,the red riding hood got lost in the forest,RED RIDING HOOD GOT LOST FOREST
2,3,I am think of moving to a new country. But hav...,THINK MOVING NEW COUNTRY HARD TIME DECIDING
3,4,what should I do when I see a new dress I like...,SEE NEW DRESS LIKE COST MUCH


In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectoriser for the cleaned sentences. max_df / min_df bound the document
# frequency a term may have to be kept; max_features caps the vocabulary size.
tfidf_vectorizer = TfidfVectorizer(max_df=0.7, max_features=200,
                                   min_df=0.2, use_idf=True)

# Learn the vocabulary and vectorise the "clean" column in one step.
tfidf_matrix = tfidf_vectorizer.fit_transform(df['clean'].values)
# Shape is (number of sentences, number of kept terms).
print(tfidf_matrix.shape)

(4, 23)


In [20]:
# Display the summary repr of the TF-IDF matrix built in the previous cell.
tfidf_matrix

<4x23 sparse matrix of type '<class 'numpy.float64'>'
	with 25 stored elements in Compressed Sparse Row format>

In [21]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it. The fallback keeps the cell running on
# releases that predate the new method.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    # The new method returns a NumPy array; convert it to a plain list of str
    # so `terms` keeps the type the old call produced.
    terms = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    terms = tfidf_vectorizer.get_feature_names()
print(len(terms))
print(terms)

23
['cost', 'country', 'deciding', 'dress', 'forest', 'going', 'got', 'hard', 'hello', 'hood', 'howre', 'like', 'long', 'lost', 'moving', 'much', 'new', 'red', 'riding', 'see', 'thing', 'think', 'time']


In [24]:
from sklearn.cluster import KMeans

num_clusters = 2

# K-means starts from randomly chosen centroids, so an unseeded run can put
# the sentences into different clusters on every execution. A fixed seed keeps
# the tables and the report below reproducible after Restart & Run All.
RANDOM_STATE = 42

# n_init is pinned explicitly because its default is not the same in every
# scikit-learn release.
km = KMeans(n_clusters=num_clusters, n_init=10, random_state=RANDOM_STATE)
km.fit(tfidf_matrix)

KMeans(n_clusters=2)

In [25]:
# Cluster labels from the fitted model, as a plain Python list.
clusters = km.labels_.tolist()
# Attach the labels to the frame as a new "cluster" column.
df['cluster'] = clusters
# Number of sentences assigned to each cluster.
print(df['cluster'].value_counts())

0    3
1    1
Name: cluster, dtype: int64


In [26]:
# Show the frame again, now including the "cluster" column.
df

Unnamed: 0,event_id,sentence,clean,cluster
0,1,Hello there! How're things going? It's been a ...,HELLO HOWRE THING GOING LONG TIME,0
1,2,the red riding hood got lost in the forest,RED RIDING HOOD GOT LOST FOREST,1
2,3,I am think of moving to a new country. But hav...,THINK MOVING NEW COUNTRY HARD TIME DECIDING,0
3,4,what should I do when I see a new dress I like...,SEE NEW DRESS LIKE COST MUCH,0


In [29]:
# For every centroid, term indices sorted from largest to smallest weight.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

for i in range(num_clusters):
    # Up to five heaviest terms and up to five member sentences per cluster.
    top_terms = [terms[ind] for ind in order_centroids[i, :5]]
    sample_titles = df.loc[df['cluster'] == i, 'clean'].tolist()[:5]

    # Assemble the report for this cluster, then emit it in one go.
    report = [f"cluster {i}:", "\twords:"]
    report += [f"\t\t{term}" for term in top_terms]
    report.append("\ttitles:")
    report += [f"\t\t{title}" for title in sample_titles]
    report.append("\t\t...")
    print("\n".join(report))

cluster 0:
	words:
		time
		new
		long
		dress
		going
	titles:
		HELLO HOWRE THING GOING LONG TIME
		THINK MOVING NEW COUNTRY HARD TIME DECIDING
		SEE NEW DRESS LIKE COST MUCH
		...
cluster 1:
	words:
		hood
		riding
		red
		forest
		lost
	titles:
		RED RIDING HOOD GOT LOST FOREST
		...
