## Setup

In [1]:
# Reference: https://www.dezyre.com/article/10-nlp-techniques-every-data-scientist-should-know/415

import pandas as pd
import numpy as np
import re
from sklearn.datasets import fetch_20newsgroups

df = fetch_20newsgroups(subset='train')

In [2]:
print(df.data[1])

From: guykuo@carson.u.washington.edu (Guy Kuo)
Subject: SI Clock Poll - Final Call
Summary: Final call for SI clock reports
Keywords: SI,acceleration,clock,upgrade
Article-I.D.: shelley.1qvfo9INNc3s
Organization: University of Washington
Lines: 11
NNTP-Posting-Host: carson.u.washington.edu

A fair number of brave souls who upgraded their SI clock oscillator have
shared their experiences for this poll. Please send a brief message detailing
your experiences with the procedure. Top speed attained, CPU rated speed,
add on cards and adapters, heat sinks, hour of usage per day, floppy disk
functionality with 800 and 1.4 m floppies are especially requested.

I will be summarizing in the next two days, so please add to the network
knowledge base if you have done the clock upgrade and haven't answered this
poll. Thanks.

Guy Kuo <guykuo@u.washington.edu>



## 1) Tokenization

In [3]:
def clean_str(string):
    string = re.sub(r"\\n", "", string) # removes new-line characters
    string = re.sub(r"[^A-Za-z]", " ", string) # removes numbers, symbols
    return string

cleaned_text = []

for text in df.data:
    cleaned_text.append(clean_str(text))
    
print(cleaned_text[1])

From  guykuo carson u washington edu  Guy Kuo  Subject  SI Clock Poll   Final Call Summary  Final call for SI clock reports Keywords  SI acceleration clock upgrade Article I D   shelley  qvfo INNc s Organization  University of Washington Lines     NNTP Posting Host  carson u washington edu  A fair number of brave souls who upgraded their SI clock oscillator have shared their experiences for this poll  Please send a brief message detailing your experiences with the procedure  Top speed attained  CPU rated speed  add on cards and adapters  heat sinks  hour of usage per day  floppy disk functionality with     and     m floppies are especially requested   I will be summarizing in the next two days  so please add to the network knowledge base if you have done the clock upgrade and haven t answered this poll  Thanks   Guy Kuo  guykuo u washington edu  


In [4]:
from nltk.tokenize import word_tokenize

tokenized = []

for article in cleaned_text:
    tokenized.append(word_tokenize(article))
    
print(tokenized[1])

['From', 'guykuo', 'carson', 'u', 'washington', 'edu', 'Guy', 'Kuo', 'Subject', 'SI', 'Clock', 'Poll', 'Final', 'Call', 'Summary', 'Final', 'call', 'for', 'SI', 'clock', 'reports', 'Keywords', 'SI', 'acceleration', 'clock', 'upgrade', 'Article', 'I', 'D', 'shelley', 'qvfo', 'INNc', 's', 'Organization', 'University', 'of', 'Washington', 'Lines', 'NNTP', 'Posting', 'Host', 'carson', 'u', 'washington', 'edu', 'A', 'fair', 'number', 'of', 'brave', 'souls', 'who', 'upgraded', 'their', 'SI', 'clock', 'oscillator', 'have', 'shared', 'their', 'experiences', 'for', 'this', 'poll', 'Please', 'send', 'a', 'brief', 'message', 'detailing', 'your', 'experiences', 'with', 'the', 'procedure', 'Top', 'speed', 'attained', 'CPU', 'rated', 'speed', 'add', 'on', 'cards', 'and', 'adapters', 'heat', 'sinks', 'hour', 'of', 'usage', 'per', 'day', 'floppy', 'disk', 'functionality', 'with', 'and', 'm', 'floppies', 'are', 'especially', 'requested', 'I', 'will', 'be', 'summarizing', 'in', 'the', 'next', 'two', 'da

## 2) Stemming and Lemmatization

In [5]:
# Stemming
from nltk.stem import PorterStemmer

porter = PorterStemmer()

print('Porter Stemmer')
print('Root word for cats -->', porter.stem('cats'))
print('Root word for mice -->', porter.stem('mice'))
print('Root word for ran -->', porter.stem('ran'))

Porter Stemmer
Root word for cats --> cat
Root word for mice --> mice
Root word for ran --> ran


In [6]:
# Lemmatization
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(u'cats mice ran')
for token in doc:
    print(token, '-->', token.lemma_)

cats --> cat
mice --> mouse
ran --> run


In [7]:
lemmatized = []
c = []

for word in tokenized[1]:
    doc = nlp(word)
    for token in doc:
        c.append(token.lemma_)
lemmatized.append(c)

In [8]:
print(lemmatized[0])

['from', 'guykuo', 'carson', 'u', 'washington', 'edu', 'guy', 'Kuo', 'subject', 'SI', 'Clock', 'poll', 'final', 'call', 'summary', 'final', 'call', 'for', 'SI', 'clock', 'report', 'keyword', 'SI', 'acceleration', 'clock', 'upgrade', 'article', 'I', 'd', 'shelley', 'qvfo', 'INNc', 's', 'organization', 'university', 'of', 'Washington', 'line', 'NNTP', 'post', 'host', 'carson', 'u', 'washington', 'edu', 'a', 'fair', 'number', 'of', 'brave', 'soul', 'who', 'upgrade', 'their', 'SI', 'clock', 'oscillator', 'have', 'share', 'their', 'experience', 'for', 'this', 'poll', 'please', 'send', 'a', 'brief', 'message', 'detail', 'your', 'experience', 'with', 'the', 'procedure', 'top', 'speed', 'attain', 'cpu', 'rate', 'speed', 'add', 'on', 'card', 'and', 'adapter', 'heat', 'sink', 'hour', 'of', 'usage', 'per', 'day', 'floppy', 'disk', 'functionality', 'with', 'and', 'm', 'floppy', 'be', 'especially', 'request', 'I', 'will', 'be', 'summarize', 'in', 'the', 'next', 'two', 'day', 'so', 'please', 'add', 

## 3) Stop Words Removal

In [9]:
from spacy.lang.en.stop_words import STOP_WORDS
stop_words = list(STOP_WORDS)

stop_words_removed = []

for word in lemmatized[0]:
    if word not in stop_words:
        stop_words_removed.append(word)
        
print(stop_words_removed)

['guykuo', 'carson', 'u', 'washington', 'edu', 'guy', 'Kuo', 'subject', 'SI', 'Clock', 'poll', 'final', 'summary', 'final', 'SI', 'clock', 'report', 'keyword', 'SI', 'acceleration', 'clock', 'upgrade', 'article', 'I', 'd', 'shelley', 'qvfo', 'INNc', 's', 'organization', 'university', 'Washington', 'line', 'NNTP', 'post', 'host', 'carson', 'u', 'washington', 'edu', 'fair', 'number', 'brave', 'soul', 'upgrade', 'SI', 'clock', 'oscillator', 'share', 'experience', 'poll', 'send', 'brief', 'message', 'detail', 'experience', 'procedure', 'speed', 'attain', 'cpu', 'rate', 'speed', 'add', 'card', 'adapter', 'heat', 'sink', 'hour', 'usage', 'day', 'floppy', 'disk', 'functionality', 'm', 'floppy', 'especially', 'request', 'I', 'summarize', 'day', 'add', 'network', 'knowledge', 'base', 'clock', 'upgrade', 'haven', 't', 'answer', 'poll', 'thank', 'guy', 'Kuo', 'guykuo', 'u', 'washington', 'edu']


## 4) Keyword Extraction via TF-IDF: a statistical technique that tells how important a word is to a document in a collection of documents

#### TF(t,d) = count of t in d / number of words in d
#### IDF(t)  = N / occurences of t in N documents
#### TF-IDF = TF * IDF

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df.data)

In [11]:
df = pd.DataFrame(X[0].T.todense(), index=vectorizer.get_feature_names(), columns=["TF-IDF"])
df = df.sort_values("TF-IDF", ascending=False)
print(df.head())

          TF-IDF
car     0.381339
lerxst  0.353835
wam     0.259709
umd     0.211868
tellme  0.176918


## 5) Word Embeddings via Word2Vec

In [13]:
from gensim.models import keyedvectors as word2vec
from sklearn.metrics.pairwise import cosine_similarity

model = word2vec.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

test = cosine_similarity(model['king'].reshape(1, -1), model['queen'].reshape(1, -1))[0][0]
print(test)

0.65109575


## 6) Sentiment Analysis

In [30]:
import nltk
from nltk.corpus import wordnet
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Reading data
df = pd.read_csv("Twitter_Data.csv")

# Tokenizing data
tokenized = []
for text in df['clean_text'].values:
    tokenized.append(word_tokenize(str(text)))
    
# Lemmatize data
lemmatizer = WordNetLemmatizer()
lemmatized = []
for text in tokenized:
    c = []
    for word in text:
        c.append(lemmatizer.lemmatize(word))
    lemmatized.append(c)
    
# Remove stop words
stop_words_removed = []
for text in lemmatized:
    c = []
    for word in text:
        if word not in stop_words:
            c.append(word)
    stop_words_removed.append(c)

In [35]:
# Transform text to embeddings
vectors = []
for text in stop_words_removed:
    if text != []: # check if not all words were removed in stop word removal
        temp = []
        for word in text:
            try:
                temp.append(model[word])
            except:
                pass
        if temp != []: # to check if embedding was found for at least one word in the tweet
            vectors.append(np.average(np.array(temp), axis=0))
        else:
            vectors.append(np.zeros(300, dtype = np.float64))
    else:
        vectors.append(np.zeros(300, dtype = np.float64))

vectors_cleaned = np.nan_to_num(vectors)
y = df.category.values
y = np.nan_to_num(y)

In [36]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(vectors_cleaned, y, test_size=0.2, random_state=42)

# Create model
clf = LogisticRegression(random_state=42).fit(np.array(X_train), np.array(y_train))
predicted_y = clf.predict(X_test)

# Evaluate model
print(accuracy_score(y_test, predicted_y))

0.6578721315498834


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## 7) Topic Modelling

In [39]:
import gensim
import gensim.corpora as corpora

dictionary = corpora.Dictionary(stop_words_removed) # encapsulates the mapping between normalized words and their int IDs

# Term Document Frequency
corpus = [dictionary.doc2bow(text) for text in stop_words_removed] # coverts words to bag of words format

lda_model = gensim.models.ldamodel.LdaModel(corpus = corpus,
                                            id2word = dictionary,
                                            num_topics = 3,
                                            random_state = 42)

In [50]:
import pyLDAvis
import pyLDAvis.gensin

pyLDAvis.enable_notebook()
vis = pyLDAvis.prepare(lda_model, corpus, dictionary)
vis

  and should_run_async(code)


ModuleNotFoundError: No module named 'pyLDAvis.gensin'

## 8) Text Summarization

In [58]:
from string import punctuation
from collections import Counter
from heapq import nlargest

doc = """
Japan is an island country in East Asia, located in the northwest Pacific Ocean. 
It is bordered on the west by the Sea of Japan, and extends from the Sea of Okhotsk in the north toward the East China Sea and Taiwan in the south. 
Part of the Ring of Fire, Japan spans an archipelago of 6852 islands covering 377,975 square kilometers (145,937 sq mi); 
the five main islands are Hokkaido, Honshu, Shikoku, Kyushu, and Okinawa. 
Tokyo is Japan's capital and largest city; other major cities include Yokohama, Osaka, Nagoya, Sapporo, Fukuoka, Kobe, and Kyoto.
"""

doc = nlp(doc)

keyword = []
pos_tag = ['PROPN','ADJ','NOUN','VERB']
for token in doc:
    if(token.text in stop_words or token.text in punctuation):
        continue
    if (token.pos_ in pos_tag):
        keyword.append(token.text)

freq_word = Counter(keyword)

max_freq = Counter(keyword).most_common(1)[0][1]
for word in freq_word.keys():
    freq_word[word] = (freq_word[word]/max_freq)
    
sent_strength = {}
for sent in doc.sents:
    for word in sent:
        if word.text in freq_word.keys():
            if sent in sent_strength.keys():
                sent_strength[sent] += freq_word[word.text]
            else:
                sent_strength[sent] = freq_word[word.text]

print(sent_strength)

{
Japan is an island country in East Asia, located in the northwest Pacific Ocean.: 3.25, It is bordered on the west by the Sea of Japan, and extends from the Sea of Okhotsk in the north toward the East China Sea and Taiwan in the south.: 5.75, Part of the Ring of Fire, Japan spans an archipelago of 6852 islands covering 377,975 square kilometers (145,937 sq mi); 
the five main islands are Hokkaido, Honshu, Shikoku, Kyushu, and Okinawa.: 6.0, Tokyo is Japan's capital and largest city; other major cities include Yokohama, Osaka, Nagoya, Sapporo, Fukuoka, Kobe, and Kyoto.
: 4.5}


  and should_run_async(code)


## 9) Named Entity Recognition

In [51]:
doc = """
Japan is an island country in East Asia, located in the northwest Pacific Ocean. 
It is bordered on the west by the Sea of Japan, and extends from the Sea of Okhotsk in the north toward the East China Sea and Taiwan in the south. 
Part of the Ring of Fire, Japan spans an archipelago of 6852 islands covering 377,975 square kilometers (145,937 sq mi); 
the five main islands are Hokkaido, Honshu, Shikoku, Kyushu, and Okinawa. 
Tokyo is Japan's capital and largest city; other major cities include Yokohama, Osaka, Nagoya, Sapporo, Fukuoka, Kobe, and Kyoto.
"""

doc = nlp(doc)

entities = [(X.text, X.label_) for X in doc.ents]
for entity in entities:
    print(entity)

('Japan', 'GPE')
('East Asia', 'LOC')
('Pacific Ocean', 'LOC')
('the Sea of Japan', 'LOC')
('the Sea of Okhotsk', 'LOC')
('the East China Sea', 'LOC')
('Taiwan', 'GPE')
('Japan', 'GPE')
('6852', 'CARDINAL')
('377,975 square kilometers', 'QUANTITY')
('145,937', 'CARDINAL')
('five', 'CARDINAL')
('Hokkaido', 'ORG')
('Honshu', 'GPE')
('Shikoku', 'LOC')
('Kyushu', 'GPE')
('Okinawa', 'PERSON')
('Tokyo', 'GPE')
('Japan', 'GPE')
('Yokohama', 'GPE')
('Osaka', 'GPE')
('Nagoya', 'GPE')
('Sapporo', 'GPE')
('Fukuoka', 'GPE')
('Kobe', 'GPE')
('Kyoto', 'GPE')


  and should_run_async(code)
