## Text classification: tokenization, bag-of-words, TF-IDF, n-grams, stemming/lemmatization, and SVD topic terms

In [1]:
def find_sentiment(sentence, pos, neg):
    """Classify a sentence by counting its positive and negative words.

    Parameters
    ----------
    sentence : str
        Text to classify. It is split on whitespace only, so punctuation
        stays attached to words (e.g. "good," does not match "good").
    pos : iterable of str
        Words considered positive.
    neg : iterable of str
        Words considered negative.

    Returns
    -------
    str
        "positive" if more distinct positive than negative words occur,
        "negative" for the opposite, and "neutral" when the counts tie.
    """
    # De-duplicate tokens: each distinct word counts at most once.
    words = set(sentence.split())

    num_common_pos = len(words.intersection(pos))
    num_common_neg = len(words.intersection(neg))

    if num_common_pos > num_common_neg:
        return "positive"
    if num_common_pos < num_common_neg:
        return "negative"
    # Fixed: the original returned the undefined name `neutral`, which raised
    # NameError whenever the two counts were equal.
    return "neutral"

In [2]:
from nltk.tokenize import word_tokenize

# Baseline tokenization: a plain whitespace split keeps punctuation attached
# to the neighbouring word (see the output below).
sentence = "hi, how are you?"
sentence.split()

['hi,', 'how', 'are', 'you?']

In [3]:
# Tokenize the same sentence with NLTK; punctuation comes out as separate tokens.
word_tokenize(sentence)

['hi', ',', 'how', 'are', 'you', '?']

In [4]:
# bag of word as initial trial

from sklearn.feature_extraction.text import  CountVectorizer

# Toy corpus: five short documents used to demonstrate bag-of-words counts.
corpus = [
    "hello, how are you?",
    "im getting bored at home. And you? What do you think?",
    "did you know about counts",
    "let's see if this works!",
    "YES!!!!",
]

# Learn the vocabulary and build the document-term count matrix in one call.
ctv = CountVectorizer()
corpus_transformed = ctv.fit_transform(corpus)
corpus_transformed

<5x23 sparse matrix of type '<class 'numpy.int64'>'
	with 25 stored elements in Compressed Sparse Row format>

In [5]:
# Inspect the token -> integer index mapping learned by the vectorizer.
ctv.vocabulary_

{'hello': 9,
 'how': 11,
 'are': 2,
 'you': 22,
 'im': 13,
 'getting': 8,
 'bored': 4,
 'at': 3,
 'home': 10,
 'and': 1,
 'what': 19,
 'do': 7,
 'think': 17,
 'did': 6,
 'know': 14,
 'about': 0,
 'counts': 5,
 'let': 15,
 'see': 16,
 'if': 12,
 'this': 18,
 'works': 20,
 'yes': 21}

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import  word_tokenize

# Same bag-of-words model, but tokenized by NLTK's word_tokenize, so
# punctuation marks show up as vocabulary entries of their own.
ctv = CountVectorizer(tokenizer=word_tokenize, token_pattern=None)
corpus_transformed = ctv.fit_transform(corpus)
print(ctv.vocabulary_)

{'hello': 14, ',': 2, 'how': 16, 'are': 7, 'you': 27, '?': 4, 'im': 18, 'getting': 13, 'bored': 9, 'at': 8, 'home': 15, '.': 3, 'and': 6, 'what': 24, 'do': 12, 'think': 22, 'did': 11, 'know': 19, 'about': 5, 'counts': 10, 'let': 20, "'s": 1, 'see': 21, 'if': 17, 'this': 23, 'works': 25, '!': 0, 'yes': 26}


In [7]:
from sklearn.feature_extraction.text import  TfidfVectorizer
from nltk.tokenize import word_tokenize

# TF-IDF weighting over the toy corpus, tokenized with NLTK's word_tokenize.
tfv = TfidfVectorizer(tokenizer=word_tokenize, token_pattern=None)
corpus_transformed = tfv.fit_transform(corpus)
corpus_transformed

<5x28 sparse matrix of type '<class 'numpy.float64'>'
	with 32 stored elements in Compressed Sparse Row format>

In [9]:
# The sparse matrix exposes its shape directly; there is no need to build a
# dense copy with .toarray() just to read it.
corpus_transformed.shape

(5, 28)

In [10]:
from nltk import  ngrams
from nltk.tokenize import word_tokenize 

# Size of each n-gram (3 -> trigrams).
N = 3
sentence = "hi, how are you?"
tokenized_sentence = word_tokenize(sentence)
print(tokenized_sentence)

# Collect every run of N consecutive tokens as a tuple.
n_grams = list(ngrams(tokenized_sentence, N))
n_grams

['hi', ',', 'how', 'are', 'you', '?']


[('hi', ',', 'how'),
 (',', 'how', 'are'),
 ('how', 'are', 'you'),
 ('are', 'you', '?')]

In [14]:
# stemming and lemmatization

from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import  SnowballStemmer

lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer(language="english")

# Compare the stem and the lemma of three inflections of the same word.
words = ["fishing", "fishes", "fished"]
for word in words:
    print(f"word={word}")
    print(f"stemmed_word={stemmer.stem(word)}")
    # NOTE(review): lemmatize() is called without a part-of-speech tag; the
    # output below leaves "fishing" and "fished" unchanged, presumably because
    # the words are treated as nouns by default — confirm against NLTK docs.
    print(f"lemma={lemmatizer.lemmatize(word)}")

word=fishing
stemmed_word=fish
lemma=fishing
word=fishes
stemmed_word=fish
lemma=fish
word=fished
stemmed_word=fish
lemma=fished


In [15]:
# apply non-negative matrix factorization (NMF) or latent semantic analysis (LSA) --> singular vector decomposition (SVD)

import pandas as pd 
from nltk.tokenize import  word_tokenize
from sklearn import  decomposition
from sklearn.feature_extraction.text import  TfidfVectorizer

# Read the first 10,000 rows of the IMDB dataset and keep only the values of
# the `review` column.
imdb_corpus = (
    pd.read_csv("./datasets/imdb/Dataset.csv", nrows=10000)
    .review
    .values
)

In [17]:
# Fit TF-IDF on the IMDB reviews, tokenized with NLTK's word_tokenize.
tfv = TfidfVectorizer(tokenizer=word_tokenize, token_pattern=None)
tfv.fit(imdb_corpus)
corpus_transformed = tfv.transform(imdb_corpus)
# Read the shape straight from the sparse matrix: calling .toarray() first
# would allocate a dense 10000 x 70447 float array (several GB) only to
# throw it away again.
corpus_transformed.shape

(10000, 70447)

In [19]:
# Seed the decomposition: TruncatedSVD's default solver is randomized, so
# without a fixed random_state the components can differ between runs.
svd = decomposition.TruncatedSVD(n_components=10, random_state=42)
# NOTE: fit() returns the fitted estimator, so `corpus_svd` is the model
# (its `components_` are read in later cells), not a reduced corpus.
corpus_svd = svd.fit(corpus_transformed)

In [22]:
# Pair every vocabulary term with its weight in the selected SVD component.
idx = 0
feature_scores = {
    term: weight
    for term, weight in zip(
        tfv.get_feature_names_out(),
        corpus_svd.components_[idx],
    )
}
len(feature_scores)

70447

In [23]:
# Number of top-scoring terms to display.
N = 5
# Order the terms by their component weight, highest first, and keep the first N.
sorted(feature_scores, key=feature_scores.get, reverse=True)[:N]

['the', ',', '.', 'a', 'and']

In [24]:
import re, string

def clean_text(s):
    """Remove punctuation from a string and normalize its whitespace.

    Parameters
    ----------
    s : str
        Raw input text.

    Returns
    -------
    str
        The text with every character in ``string.punctuation`` removed,
        runs of whitespace collapsed to single spaces, and no leading or
        trailing whitespace.
    """
    # Strip punctuation first ...
    s = re.sub(f"[{re.escape(string.punctuation)}]", "", s)
    # ... then collapse whitespace. The original normalized whitespace before
    # removing punctuation, so stand-alone punctuation (e.g. " - " or a
    # trailing " !") left double or trailing spaces in the result.
    return " ".join(s.split())

In [26]:
# Quick check of clean_text on a sentence with mixed punctuation.
s = "oh, are you kidding me?!"
clean_text(s)

'oh are you kidding me'

In [28]:
# Reload the reviews as a DataFrame and add a punctuation-free copy of each
# review in a new `clean_review` column.
imdb_corpus = pd.read_csv("./datasets/imdb/Dataset.csv", nrows=10000)
imdb_corpus["clean_review"] = imdb_corpus["review"].apply(clean_text)
imdb_corpus.head()

Unnamed: 0,review,sentiment,clean_review
0,One of the other reviewers has mentioned that ...,positive,One of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...,positive,A wonderful little production br br The filmin...
2,I thought this was a wonderful way to spend ti...,positive,I thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...,negative,Basically theres a family where a little boy J...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,Petter Matteis Love in the Time of Money is a ...


In [33]:
# Top terms per SVD component, computed on the RAW review text.
corpus = imdb_corpus.review.values
tfv = TfidfVectorizer(tokenizer=word_tokenize, token_pattern=None)
tfv.fit(corpus)
corpus_transformed = tfv.transform(corpus)

# Seeded so the randomized solver yields the same components on every run.
svd = decomposition.TruncatedSVD(n_components=10, random_state=42)
# fit() returns the fitted estimator; `components_` is read from it below.
corpus_svd = svd.fit(corpus_transformed)

# The vocabulary is identical for every component, so fetch it once instead
# of rebuilding it on each loop iteration.
feature_names = tfv.get_feature_names_out()

for idx in range(5):
    # Map each term to its weight in component `idx`, then print the five
    # terms with the highest weights.
    feature_scores = dict(zip(feature_names, corpus_svd.components_[idx]))
    print(sorted(feature_scores, key=feature_scores.get, reverse=True)[:5])

['the', ',', '.', 'a', 'and']
['br', '<', '>', '/', '-']
['i', 'movie', '!', 'it', 'was']
[',', '!', "''", '``', 'you']
['!', 'the', "''", '``', '...']


In [34]:
# Top terms per SVD component, computed on the punctuation-free
# `clean_review` text.
corpus = imdb_corpus.clean_review.values
tfv = TfidfVectorizer(tokenizer=word_tokenize, token_pattern=None)
tfv.fit(corpus)
corpus_transformed = tfv.transform(corpus)

# Seeded so the randomized solver yields the same components on every run.
svd = decomposition.TruncatedSVD(n_components=10, random_state=42)
# fit() returns the fitted estimator; `components_` is read from it below.
corpus_svd = svd.fit(corpus_transformed)

# The vocabulary is identical for every component, so fetch it once instead
# of rebuilding it on each loop iteration.
feature_names = tfv.get_feature_names_out()

for idx in range(5):
    # Map each term to its weight in component `idx`, then print the five
    # terms with the highest weights.
    feature_scores = dict(zip(feature_names, corpus_svd.components_[idx]))
    print(sorted(feature_scores, key=feature_scores.get, reverse=True)[:5])

['the', 'a', 'and', 'of', 'to']
['i', 'movie', 'it', 'was', 'this']
['the', 'was', 'i', 'were', 'of']
['her', 'was', 'she', 'i', 'he']
['br', 'to', 'they', 'he', 'show']
