In [1]:
import spacy
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
# Load the English spaCy pipeline via the 'en' shortcut; shared by the cells below.
nlp = spacy.load('en')

In [2]:
# Load the train/test CSV files.
df_train = pd.read_csv('Data Sets/train.csv')
df_test = pd.read_csv('Data Sets/test.csv')

# Feature selection: only the comment text (CONTENT) is kept as input and
# CLASS as the label; every other column is dropped.
df_train = df_train[['CONTENT', 'CLASS']]
df_test = df_test[['CONTENT']]

# View the label counts to check for class imbalance.
df_train['CLASS'].value_counts()
# The two classes are close in size, so we can move on to pre-processing the text.

1    586
0    571
Name: CLASS, dtype: int64

In [3]:
# Run the spaCy pipeline on the CONTENT value of the row labelled 0.
doc = nlp(df_train.loc[0, 'CONTENT'])

In [4]:
# Show every token of `doc` next to its part-of-speech tag.
for token in doc:
    print(token.text, token.pos_)

# Display the document itself as the cell result.
doc

Huh INTJ
, PUNCT
anyway INTJ
check VERB
out PART
this DET
you[tube NOUN
] PUNCT
channel NOUN
: PUNCT
kobyoshi02 NOUN


Huh, anyway check out this you[tube] channel: kobyoshi02

In [7]:
# Same sentence as the raw comment, but with the brackets in "you[tube]" removed.
doc2 = nlp(u'Huh, anyway check out this youtube channel: kobyoshi02')

In [8]:
# Token text, part-of-speech tag and lemma for every token of `doc2`.
for token in doc2:
    print(token.text, token.pos_, token.lemma_)

Huh INTJ huh
, PUNCT ,
anyway INTJ anyway
check VERB check
out PART out
this DET this
youtube NOUN youtube
channel NOUN channel
: PUNCT :
kobyoshi02 NOUN kobyoshi02


In [7]:
# List the named entities found in `doc2` together with their labels.
for entity in doc2.ents:
    print(entity, entity.label_)

Youtube ORG


In [8]:
# List the noun chunks of `doc2`.
for chunk in doc2.noun_chunks:
    print(chunk)

this Youtube channel


In [9]:
from spacy import displacy

In [10]:
# Render the dependency parse of `doc` inline in the notebook.
displacy.render(
    doc,
    style='dep',
    jupyter=True,
    options={'distance': 110},
)

In [11]:
# Render the named entities of `doc2` inline in the notebook.
# NOTE(review): 'distance' looks like a dependency-view option - confirm it has any effect for style='ent'.
displacy.render(
    doc2,
    style='ent',
    jupyter=True,
    options={'distance': 110},
)

In [12]:
# Start displaCy's built-in web server for `doc2`. The captured output shows it
# using the 'dep' visualizer on port 5000 and later being shut down.
displacy.serve(doc2)

  "__main__", mod_spec)



Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [13]:
# For each token print text, POS tag, the integer lemma (`lemma`) and the
# lemma string (`lemma_`).
for token in doc:
    print(token.text, token.pos_, token.lemma, token.lemma_)

Huh INTJ 14763614096903878882 huh
, PUNCT 2593208677638477497 ,
anyway INTJ 12129547927431362550 anyway
check VERB 13320680580156776400 check
out PART 1696981056005371314 out
this DET 1995909169258310477 this
you[tube NOUN 493211788066837330 you[tube
] PUNCT 3806482680584466996 ]
channel NOUN 7635442270831496872 channel
: PUNCT 11532473245541075862 :
kobyoshi02 NOUN 959071376488098283 kobyoshi02


In [14]:
from spacy.matcher import Matcher

In [15]:
# Token-pattern matcher bound to the pipeline's vocabulary.
matcher = Matcher(nlp.vocab)

In [16]:
# Three spellings to match, each compared on the lower-cased token text.
# One token: "Solarpower"
pattern1 = [dict(LOWER='solarpower')]
# Three tokens with any punctuation token in the middle: e.g. "Solar-power"
pattern2 = [dict(LOWER='solar'), dict(IS_PUNCT=True), dict(LOWER='power')]
# Two tokens: "solar power"
pattern3 = [dict(LOWER='solar'), dict(LOWER='power')]

In [17]:
# Register all three patterns under one match id; `None` means no callback.
matcher.add('SolarPower', None, pattern1, pattern2, pattern3)

In [18]:
# Sample text containing all three spellings the patterns target.
doc = nlp(u'The Solar power industry uses solar-power. SolarPower is amazing.')

In [19]:
# Run the matcher over the document and keep its result.
found_match = matcher(doc)

In [20]:
# Raw matcher result: (match id, start token index, end token index) triples.
print(found_match)

[(8656102463236116519, 1, 3), (8656102463236116519, 5, 8), (8656102463236116519, 9, 10)]


In [21]:
# Translate each match id back to its string name and show the matched span.
for match_id, start, end in found_match:
    matched_span = doc[start:end]
    print(nlp.vocab.strings[match_id], matched_span.text)

SolarPower Solar power
SolarPower solar-power
SolarPower SolarPower


In [22]:
# Number of entries in the pipeline's default stop-word collection.
len(nlp.Defaults.stop_words)

326

In [13]:
from spacy.matcher import PhraseMatcher

In [14]:
# Phrase matcher bound to the pipeline's vocabulary.
p_match = PhraseMatcher(nlp.vocab)

In [15]:
# Read the whole text file and run it through the spaCy pipeline.
with open('reaganomics.txt') as f:
    doc3 = nlp(
        f.read()
    )

In [16]:
# Display the parsed document (the full text is echoed as the cell output).
doc3

REAGANOMICS
https://en.wikipedia.org/wiki/Reaganomics

Reaganomics (a portmanteau of [Ronald] Reagan and economics attributed to Paul Harvey)[1] refers to the economic policies promoted by U.S. President Ronald Reagan during the 1980s. These policies are commonly associated with supply-side economics, referred to as trickle-down economics or voodoo economics by political opponents, and free-market economics by political advocates.

The four pillars of Reagan's economic policy were to reduce the growth of government spending, reduce the federal income tax and capital gains tax, reduce government regulation, and tighten the money supply in order to reduce inflation.[2]

The results of Reaganomics are still debated. Supporters point to the end of stagflation, stronger GDP growth, and an entrepreneur revolution in the decades that followed.[3][4] Critics point to the widening income gap, an atmosphere of greed, and the national debt tripling in eight years which ultimately reversed the pos

In [17]:
# Literal phrases to search for in the document.
phrase_to_find = [
    'economic activity',
    'circa 1980',
    'voodoo economics',
]

In [18]:
# Convert every search phrase to a spaCy Doc, as the phrase matcher is given Docs.
phrase = list(map(nlp, phrase_to_find))

In [19]:
# Register all phrase Docs under one match id; `None` means no callback.
p_match.add('PMatch', None, *phrase)

In [20]:
# Run the phrase matcher over the document and keep its result.
found = p_match(doc3)

In [24]:
# For each phrase match print: numeric id, its string name, the matched text
# and the start/end token indices.
for match_id, start, end in found:
    matched_span = doc3[start:end]
    print(match_id, nlp.vocab.strings[match_id], matched_span.text, start, end)

3199586003214607400 PMatch voodoo economics 54 56
3199586003214607400 PMatch circa 1980 5711 5713


In [None]:
def reduce_to_double_max(text):
    """Collapse exaggerated character repetition in ``text``.

    Steps:
        1. Every run of 3+ identical word characters (``\\w``: letters, digits,
           underscore) is shortened to exactly 2, e.g. "sooooo" -> "soo".
           Digits are included, so "1000" becomes "100".
        2. Every run of 2+ identical non-word characters (punctuation,
           whitespace, ...) is shortened to a single one, e.g. "!!!" -> "!".

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    # Imported here because the notebook never imports `re` at the top level;
    # without it the function fails with NameError on first call.
    import re

    text = re.sub(r'(\w)\1{2,}', r'\1\1', text)
    return re.sub(r'(\W)\1+', r'\1', text)

def preprocess_corpus(corpus):
    """Apply all preprocessing rules to the corpus.

    Each entry is lower-cased, passed through ``reduce_to_double_max``, run
    through the spaCy pipeline ``nlp`` and reduced to the space-joined lemmas
    of its purely alphabetic tokens.

    Args:
        corpus: Iterable of texts. Entries that are not ``str`` (for example a
            missing value read by pandas) are treated as empty text instead of
            failing on ``.lower()``.

    Returns:
        A list with one processed string per input entry, in input order.
    """
    cleaned = (
        reduce_to_double_max(s.lower()) if isinstance(s, str) else ''
        for s in corpus
    )
    # `n_threads` is no longer passed: spaCy deprecated it in v2.1 (where it is
    # ignored) and later versions do not accept it.
    docs = nlp.pipe(cleaned, batch_size=1000)
    return [' '.join(x.lemma_ for x in doc if x.is_alpha) for doc in docs]


# Show the components of the loaded pipeline.
nlp.pipeline

doc = nlp(u'Hi, I am Yatin! Nice to meet you. I won Rs1000.')

# Token text, part-of-speech tag and dependency label for every token.
for token in doc:
    print(token.text, token.pos_, token.dep_)



# Pre-process the training comments.
train_processed = preprocess_corpus(df_train['CONTENT'])

train_processed

# Store the processed text next to the raw text.
df_train['Processed'] = train_processed

df_train.head()

# Same pre-processing for the test comments.
df_test['Processed'] = preprocess_corpus(df_test['CONTENT'])

df_test.head()

from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model, model_from_json
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.preprocessing.text import Tokenizer

# Fit the tokenizer on the processed training text only.
t = Tokenizer()
t.fit_on_texts(df_train['Processed'])
# One more than the number of known words
# (presumably so index 0 stays free for padding - confirm against the Keras docs).
vocab_size = len(t.word_index) + 1

vocab_size

# Every sequence is padded at the end ('post') to this many entries.
max_length = 100
encoded_docs = t.texts_to_sequences(df_train['Processed'])
padded_docs = pad_sequences(
    encoded_docs,
    maxlen=max_length,
    padding='post',
)

# Test data: encoded with the tokenizer fitted on the training text.
test_encoded_docs = t.texts_to_sequences(df_test['Processed'])
test_padded_docs = pad_sequences(
    test_encoded_docs,
    maxlen=max_length,
    padding='post',
)

EMBEDDING_PATH = "./Data Sets/Test/glove.6B.50d.txt"

# Build a {word: float32 vector} lookup from the embedding file.
# The file is read as UTF-8 *text* so that the dictionary keys are `str`.
# Opening it in binary mode ('rb') produced `bytes` keys, and a `bytes` key
# never compares equal to a `str` key, so every later lookup by a word string
# silently missed.
embeddings_index = dict()
with open(EMBEDDING_PATH, encoding='utf-8') as f:
    for line in f:
        # Split on single spaces only, so unusual whitespace inside a token
        # cannot shift the columns.
        values = line.rstrip('\n').split(' ')
        if len(values) < 2:
            # Blank or malformed line: nothing to store.
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Loaded %s word vectors.' % len(embeddings_index))

# Weight matrix for the embedding layer: row `i` holds the pretrained vector of
# the word whose tokenizer index is `i`; words without a pretrained vector keep
# an all-zero row.
embedding_matrix = np.zeros((vocab_size, 50))
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

print(vocab_size)

model = Sequential()
# Embedding layer: outputs the word vector for each word index in the sequence.
# Its dimensions are derived from `embedding_matrix` and `max_length` instead of
# repeating the literals 50 and 100, so they cannot drift from the padding
# length and the loaded vectors. The weights are frozen (trainable=False).
model.add(Embedding(vocab_size,
                    embedding_matrix.shape[1], weights=[embedding_matrix],
                    input_length=max_length,
                    trainable=False))

# Bidirectional LSTM that returns only its final output, followed by a small
# dense head with light dropout.
model.add(Bidirectional(LSTM(units=50, return_sequences=False, dropout=0.1, recurrent_dropout=0.1)))
model.add(Dense(50, activation="relu"))
model.add(Dropout(0.1))

# Single sigmoid unit for the binary CLASS label.
model.add(Dense(1, activation='sigmoid'))

optimizer = optimizers.Adam(lr=0.001)
model.compile(loss='binary_crossentropy',
            optimizer=optimizer,
            metrics=['accuracy'])

model.summary()

model.fit(padded_docs, df_train['CLASS'], epochs=2, batch_size=32)

import spacy


# Reload the English pipeline with the parser, NER and text-categoriser
# components disabled.
nlp = spacy.load(
    'en',
    disable=['parser', 'ner', 'textcat'],
)

def reduce_to_double_max(text):
    """Collapse exaggerated character repetition in ``text``.

    Steps:
        1. Every run of 3+ identical word characters (``\\w``: letters, digits,
           underscore) is shortened to exactly 2, e.g. "sooooo" -> "soo".
           Digits are included, so "1000" becomes "100".
        2. Every run of 2+ identical non-word characters (punctuation,
           whitespace, ...) is shortened to a single one, e.g. "!!!" -> "!".

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    # Imported here because the notebook never imports `re` at the top level;
    # without it the function fails with NameError on first call.
    import re

    text = re.sub(r'(\w)\1{2,}', r'\1\1', text)
    return re.sub(r'(\W)\1+', r'\1', text)

def preprocess_corpus(corpus):
    """Apply all preprocessing rules to the corpus.

    Each entry is lower-cased, passed through ``reduce_to_double_max``, run
    through the spaCy pipeline ``nlp`` and reduced to the space-joined lemmas
    of its purely alphabetic tokens.

    Args:
        corpus: Iterable of texts. Entries that are not ``str`` (for example a
            missing value read by pandas) are treated as empty text instead of
            failing on ``.lower()``.

    Returns:
        A list with one processed string per input entry, in input order.
    """
    cleaned = (
        reduce_to_double_max(s.lower()) if isinstance(s, str) else ''
        for s in corpus
    )
    # `n_threads` is no longer passed: spaCy deprecated it in v2.1 (where it is
    # ignored) and later versions do not accept it.
    docs = nlp.pipe(cleaned, batch_size=1000)
    return [' '.join(x.lemma_ for x in doc if x.is_alpha) for doc in docs]


# Show the components of the loaded pipeline.
nlp.pipeline

doc = nlp(u'Hi, I am Yatin! Nice to meet you. I won Rs1000.')

# Token text, part-of-speech tag and dependency label for every token.
for token in doc:
    print(token.text, token.pos_, token.dep_)

# Load the train/test CSV files.
df_train = pd.read_csv('Data Sets/train.csv')
df_test = pd.read_csv('Data Sets/test.csv')

# Feature selection: only the comment text (CONTENT) is kept as input and
# CLASS as the label; every other column is dropped.
df_train = df_train[['CONTENT', 'CLASS']]
df_test = df_test[['CONTENT']]

# View the label counts to check for class imbalance.
df_train['CLASS'].value_counts()
# No class imbalance present, so we can move on to pre-processing the text.

# Pre-process the training comments.
train_processed = preprocess_corpus(df_train['CONTENT'])

train_processed

# Store the processed text next to the raw text.
df_train['Processed'] = train_processed

df_train.head()

# Same pre-processing for the test comments.
df_test['Processed'] = preprocess_corpus(df_test['CONTENT'])

df_test.head()



In [61]:
# Sample list used to demonstrate argument unpacking.
b = list(range(1, 5))

In [60]:
def a(*args):
    """Print every positional argument on its own line, in call order."""
    position = 0
    while position < len(args):
        print(args[position])
        position += 1

In [62]:
# Unpack the list `b` so that each element is passed as a separate positional argument.
a(*b)

1
2
3
4


In [None]:
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model, model_from_json
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.preprocessing.text import Tokenizer

# Fit the tokenizer on the processed training text only.
t = Tokenizer()
t.fit_on_texts(df_train['Processed'])
# One more than the number of known words
# (presumably so index 0 stays free for padding - confirm against the Keras docs).
vocab_size = len(t.word_index) + 1

vocab_size

# Every sequence is padded at the end ('post') to this many entries.
max_length = 100
encoded_docs = t.texts_to_sequences(df_train['Processed'])
padded_docs = pad_sequences(
    encoded_docs,
    maxlen=max_length,
    padding='post',
)

# Test data: encoded with the tokenizer fitted on the training text.
test_encoded_docs = t.texts_to_sequences(df_test['Processed'])
test_padded_docs = pad_sequences(
    test_encoded_docs,
    maxlen=max_length,
    padding='post',
)

EMBEDDING_PATH = "./Data Sets/Test/glove.6B.50d.txt"

# Build a {word: float32 vector} lookup from the embedding file.
# The file is read as UTF-8 *text* so that the dictionary keys are `str`.
# Opening it in binary mode ('rb') produced `bytes` keys, and a `bytes` key
# never compares equal to a `str` key, so every later lookup by a word string
# silently missed.
embeddings_index = dict()
with open(EMBEDDING_PATH, encoding='utf-8') as f:
    for line in f:
        # Split on single spaces only, so unusual whitespace inside a token
        # cannot shift the columns.
        values = line.rstrip('\n').split(' ')
        if len(values) < 2:
            # Blank or malformed line: nothing to store.
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Loaded %s word vectors.' % len(embeddings_index))

# Weight matrix for the embedding layer: row `i` holds the pretrained vector of
# the word whose tokenizer index is `i`; words without a pretrained vector keep
# an all-zero row.
embedding_matrix = np.zeros((vocab_size, 50))
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

print(vocab_size)

model = Sequential()
# Embedding layer: outputs the word vector for each word index in the sequence.
# Its dimensions are derived from `embedding_matrix` and `max_length` instead of
# repeating the literals 50 and 100, so they cannot drift from the padding
# length and the loaded vectors. The weights are frozen (trainable=False).
model.add(Embedding(vocab_size,
                    embedding_matrix.shape[1], weights=[embedding_matrix],
                    input_length=max_length,
                    trainable=False))

# Bidirectional LSTM that returns only its final output, followed by a small
# dense head with light dropout.
model.add(Bidirectional(LSTM(units=50, return_sequences=False, dropout=0.1, recurrent_dropout=0.1)))
model.add(Dense(50, activation="relu"))
model.add(Dropout(0.1))

# Single sigmoid unit for the binary CLASS label.
model.add(Dense(1, activation='sigmoid'))

optimizer = optimizers.Adam(lr=0.001)
model.compile(loss='binary_crossentropy',
            optimizer=optimizer,
            metrics=['accuracy'])

model.summary()

model.fit(padded_docs, df_train['CLASS'], epochs=2, batch_size=32)