In [3]:
import nltk
# Fetch the 'wordnet' package into the local nltk_data directory.
# The call returns True (see the cell output) once the package is available.
# NOTE(review): presumably needed by the WordNetLemmatizer used in the next cell — confirm.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/dua/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [4]:
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
# Build the two normalizers that are compared side by side below:
# a Snowball stemmer for English and a WordNet-based lemmatizer.
stemmer = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()

# Three inflections of the same word to show how each tool treats them.
words = ["fishing", "fishes", "fished"]
for word in words:
    stemmed_word = stemmer.stem(word)
    lemma = lemmatizer.lemmatize(word)
    # One report per word: three "key=value" lines followed by an empty line.
    print(
        f"word={word}",
        f"stemmed_word={stemmed_word}",
        f"lemma={lemma}",
        "",
        sep="\n",
    )


word=fishing
stemmed_word=fish
lemma=fishing

word=fishes
stemmed_word=fish
lemma=fish

word=fished
stemmed_word=fish
lemma=fished



In [8]:
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn import decomposition
from sklearn.feature_extraction.text import TfidfVectorizer
# create a corpus of sentences
# read only 10k samples from training data
# NOTE(review): the directory name "input " contains a trailing space —
# confirm it matches the path on disk.

corpus = pd.read_csv("/home/dua/Documents/text_classify_regression/input /IMDB Dataset.csv", nrows=10000)
# keep only the raw review texts as an array of strings
corpus = corpus.review.values
# initialize TfidfVectorizer with word_tokenize from nltk as the tokenizer
# (token_pattern=None because a custom tokenizer is supplied)
tfv = TfidfVectorizer(tokenizer=word_tokenize, token_pattern=None)
# fit the vectorizer on the corpus and transform it in a single pass
corpus_transformed = tfv.fit_transform(corpus)
# initialize SVD with 10 components; the solver is randomized, so fix the
# seed to make the notebook reproducible under Restart & Run All
svd = decomposition.TruncatedSVD(n_components=10, random_state=42)
# fit SVD (fit() returns the fitted estimator itself)
corpus_svd = svd.fit(corpus_transformed)
# choose the first SVD component (i.e. the first "topic", not a sample) and
# create a dictionary of feature names and their scores for that component
sample_index = 0
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# get_feature_names_out() and fall back to the old name on older releases
feature_names = (
    tfv.get_feature_names_out()
    if hasattr(tfv, "get_feature_names_out")
    else tfv.get_feature_names()
)
feature_scores = dict(
    zip(
        feature_names,
        corpus_svd.components_[sample_index]
    )
)
# once have the dictionary, can sort it in decreasing order and get the top N
# features of the chosen component
N = 5
print(sorted(feature_scores, key=feature_scores.get, reverse=True)[:N])

['the', ',', '.', 'and', 'a']


In [9]:
"""
Clean any text data, especially when it is stored in a pandas DataFrame.
Example: convert a string like "hi, how are you???" to "hi how are you".
"""

import re
import string
def clean_text(s):
    """
    Remove punctuation from a string and normalise its whitespace.

    s: string
    return: cleaned string, with every character of string.punctuation
            removed and every run of whitespace (spaces, tabs, newlines)
            collapsed to a single space, without leading/trailing spaces

    example: "hi, how are you???" becomes "hi how are you"
    """
    # remove all punctuations using regex and string module;
    # this is done FIRST so that a stand-alone punctuation token
    # ("hello - world") does not leave a double space behind
    s = re.sub(f'[{re.escape(string.punctuation)}]', '', s)
    # split by all whitespaces and join tokens by a single space
    # >> removes all kinds of weird space
    # "hi   how are you" becomes "hi how are you"
    s = " ".join(s.split())
    # you can add more cleaning here if you want
    # and then return the cleaned string
    return s

In [None]:
"""
apply above function to the old SVD code 
"""

import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn import decomposition
from sklearn.feature_extraction.text import TfidfVectorizer
# create a corpus of sentences
# read only 10k samples from training data
# NOTE(review): the directory name "input " contains a trailing space —
# confirm it matches the path on disk.

corpus = pd.read_csv("/home/dua/Documents/text_classify_regression/input /IMDB Dataset.csv", nrows=10000)

# clean every review with clean_text (defined in an earlier cell)
corpus.loc[:, "review"] = corpus.review.apply(clean_text)

# keep only the cleaned review texts; passing the whole DataFrame to the
# vectorizer would iterate over its column names instead of the reviews
corpus = corpus.review.values

# initialize TfidfVectorizer with word_tokenize from nltk as the tokenizer
# (token_pattern=None because a custom tokenizer is supplied)
tfv = TfidfVectorizer(tokenizer=word_tokenize, token_pattern=None)
# fit the vectorizer on the corpus and transform it in a single pass
corpus_transformed = tfv.fit_transform(corpus)
# initialize SVD with 10 components; the solver is randomized, so fix the
# seed to make the notebook reproducible under Restart & Run All
svd = decomposition.TruncatedSVD(n_components=10, random_state=42)
# fit SVD (fit() returns the fitted estimator itself)
corpus_svd = svd.fit(corpus_transformed)
# choose the first SVD component (i.e. the first "topic", not a sample) and
# create a dictionary of feature names and their scores for that component
sample_index = 0
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# get_feature_names_out() and fall back to the old name on older releases
feature_names = (
    tfv.get_feature_names_out()
    if hasattr(tfv, "get_feature_names_out")
    else tfv.get_feature_names()
)
feature_scores = dict(
    zip(
        feature_names,
        corpus_svd.components_[sample_index]
    )
)
# once have the dictionary, can sort it in decreasing order and get the top N
# features of the chosen component
N = 5
print(sorted(feature_scores, key=feature_scores.get, reverse=True)[:N])

In [17]:
#sentence vector 
import numpy as np
def sentence_to_vec(s, embedding_dict, stop_words, tokenizer, embedding_dim=None):
    """
    Given a sentence and other information, this function returns a single
    embedding for the whole sentence: the sum of the embeddings of its known
    words, scaled to unit L2 norm.

    s: sentence; converted with str() and lowercased before tokenization
    embedding_dict: dictionary word:vector
    stop_words: collection of stop words, if any (None means no stop words)
    tokenizer: a tokenization function, called with the lowercased sentence
    embedding_dim: length of the zero vector returned when no word of the
        sentence has an embedding. When None (default) it is inferred from
        the first vector in embedding_dict, falling back to 300 if the
        dictionary is empty or cannot be inspected.
    return: 1-d numpy array; all zeros when no usable word is found or when
        the summed vector has zero norm (so the result never contains NaN)
    """
    # "if any": treat a missing stop-word collection as empty
    if stop_words is None:
        stop_words = ()
    # convert sentence to string, lowercase it and tokenize it
    tokens = tokenizer(str(s).lower())
    # drop stop words and keep only purely alphabetic tokens (str.isalpha),
    # then fetch the embedding of every token the dictionary knows about
    M = [
        embedding_dict[w]
        for w in tokens
        if w not in stop_words and w.isalpha() and w in embedding_dict
    ]
    # if we dont have any vectors, return zeros of the embedding size
    if len(M) == 0:
        if embedding_dim is None:
            # peek at one stored vector so the fallback matches the
            # dimensionality of real sentence vectors
            values = getattr(embedding_dict, "values", None)
            first = next(iter(values()), None) if callable(values) else None
            embedding_dim = 300 if first is None else np.asarray(first).shape[-1]
        return np.zeros(embedding_dim)
    # convert list of embeddings to array and sum over the word axis
    v = np.array(M).sum(axis=0)
    # L2 norm of the summed vector
    norm = np.sqrt((v ** 2).sum())
    # vectors can cancel out (or be all zeros); avoid dividing by zero
    if norm == 0:
        return np.zeros(v.shape)
    # return normalized vector
    return v / norm