In [1]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.base import TransformerMixin
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from collections import Counter
import numpy as np

In [4]:
import sys
#!conda install --yes --prefix {sys.prefix} torchvision
!{sys.executable} -m pip install --user --upgrade torch torchvision torchaudio



In [2]:
nlp = spacy.load("en_core_web_sm")

In [3]:
text = """lolol this quiz has got to be most ridiculous quiz I have ever done in my life.
never had I just looked at the qn and select the first option that I see and click next.
that's how much of a rush we were in lmaoo. I thought can crtl f in my pdf file. 
lmao i was so wrong. I don't even have time to think bruh

"""
doc = nlp(text)
senlist = [sen for sen in doc.sents]

In [4]:
word_freq = Counter(senlist[1])
word_freq.most_common(5)
unique_words = [word for (word, freq) in word_freq.items() if freq == 1]
unique_words

[,
 never,
 had,
 I,
 just,
 looked,
 at,
 the,
 qn,
 and,
 select,
 the,
 first,
 option,
 that,
 I,
 see,
 and,
 click,
 next,
 .]

In [9]:
adj_list = []
for sen in senlist:
    for token in sen:
        if token.pos_ == "ADJ":
            adj_list.append(token.text)
adj_list

['ridiculous', 'first', 'much', 'wrong']

In [3]:
#Pre-processing
def is_token_allowed(token):
    if (not token or not token.text.strip() or token.is_stop or token.is_punct):
        return False
    return True
def preprocess_token(token):
    return token.lemma_.strip().lower()

In [4]:
complete_filtered_tokens = [preprocess_token(token) for token in doc if is_token_allowed(token)]
complete_filtered_tokens

['lolol',
 'quiz',
 'get',
 'ridiculous',
 'quiz',
 'life',
 'look',
 'qn',
 'select',
 'option',
 'click',
 'rush',
 'lmaoo',
 'think',
 'crtl',
 'f',
 'pdf',
 'file',
 'lmao',
 'wrong',
 'time',
 'think',
 'bruh']

In [34]:
#Word Vectors
your_word = "dog"

ms = nlp.vocab.vectors.most_similar(
    np.asarray([nlp.vocab.vectors[nlp.vocab.strings[your_word]]]), n=10) #show the first 10 most similar words to dog
words = [nlp.vocab.strings[w] for w in ms[0][0]]
print(words)

['dog', 'KENNEL', 'dogs', 'CANINES', 'GREYHOUND', 'pet', 'Pet-Care', 'FELINE', 'cat', 'BEAGLES']


In [29]:
#Similarity
nlp("dog kennel").similarity(nlp("Pooches pet"))

0.8634041115001581

In [35]:
#Pipeline
nlp = spacy.blank("en") #creating a blank pipeline
nlp.add_pipe("sentencizer") #creating your model by adding modifiers

import requests
from bs4 import BeautifulSoup
s = requests.get("https://ocw.mit.edu/ans7870/6/6.006/s08/lecturenotes/files/t8.shakespeare.txt")
soup = BeautifulSoup(s.content).text.replace("-\n", "").replace("\n", " ")[:30]
doc = nlp(soup)

#analyse pipelines
nlp2.analyze_pipes()

'This is the 100th Etext file p'

In [46]:
for ent in nlp("(555) 555-5555").ents:
    print(ent.text, ent.label_)

555 CARDINAL


In [47]:
#Rules-Based SpaCy
#EntityRuler
nlp = spacy.load("en_core_web_sm")
#Create the Ruler and Add it
ruler = nlp.add_pipe("entity_ruler", before = "ner")
#List of Entities and Patterns (source: https://spacy.io/usage/rule-based-matching)
patterns = [
                {"label": "PHONE_NUMBER", "pattern": [{"ORTH": "("}, {"SHAPE": "ddd"}, {"ORTH": ")"}, {"SHAPE": "ddd"},
                {"ORTH": "-", "OP": "?"}, {"SHAPE": "dddd"}]},
                {"label": "FILM", "pattern": "Mr. Deeds"}
            ]
#add patterns to ruler
ruler.add_patterns(patterns)

In [48]:
for ent in nlp("(555) 555-5555").ents:
    print(ent.text, ent.label_)

(555) 555-5555 PHONE_NUMBER


In [62]:
#Matcher, takes into account linguistic components
from spacy.matcher import Matcher
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
pattern = [{"LIKE_EMAIL": True}]
matcher.add("EMAIL_ADDRESS", [pattern])
doc = nlp("This is an email address: wmattingly@aol.com")
matches = matcher(doc)
print(matches, nlp.vocab[matches[0][0]].text, doc[matches[0][1]:matches[0][2]]) #Lexeme, start token, end token

#finding all proper nouns followed by a verb
text = "King Arthur pulled the sword out. This is impressive. Merlin taught him to do that."
    
matcher = Matcher(nlp.vocab)
pattern = [{"POS": "PROPN", "OP": "+"}, {"POS": "VERB"}]
matcher.add("PROPER_NOUNS", [pattern], greedy='LONGEST')
doc = nlp(text)
matches = matcher(doc)
matches.sort(key = lambda x: x[1])
for match in matches[:10]:
    print (match, doc[match[1]:match[2]])
    
#finding all speeches and their speaker
text = "King Arthur pulled the sword out. This is impressive. Merlin taught him to do that. 'Oh nyo!' thought Arthur 'Merlin saw me'"

speak_lemmas = ["think", "say"]
matcher = Matcher(nlp.vocab)
pattern1 = [{'ORTH': "'"}, {'IS_ALPHA': True, "OP": "+"}, {'IS_PUNCT': True, "OP": "*"}, {'ORTH': "'"}, {"POS": "VERB", "LEMMA": {"IN": speak_lemmas}}, {"POS": "PROPN", "OP": "+"}, {'ORTH': "'"}, {'IS_ALPHA': True, "OP": "+"}, {'IS_PUNCT': True, "OP": "*"}, {'ORTH': "'"}]
pattern2 = [{'ORTH': "'"}, {'IS_ALPHA': True, "OP": "+"}, {'IS_PUNCT': True, "OP": "*"}, {'ORTH': "'"}, {"POS": "VERB", "LEMMA": {"IN": speak_lemmas}}, {"POS": "PROPN", "OP": "+"}]
pattern3 = [{"POS": "PROPN", "OP": "+"},{"POS": "VERB", "LEMMA": {"IN": speak_lemmas}}, {'ORTH': "'"}, {'IS_ALPHA': True, "OP": "+"}, {'IS_PUNCT': True, "OP": "*"}, {'ORTH': "'"}]
matcher.add("PROPER_NOUNS", [pattern1, pattern2, pattern3], greedy='LONGEST')
doc = nlp(text)
matches = matcher(doc)
matches.sort(key = lambda x: x[1])
for match in matches[:10]:
    print (match, doc[match[1]:match[2]])

[(16571425990740197027, 6, 7)] EMAIL_ADDRESS wmattingly@aol.com
(3232560085755078826, 0, 3) King Arthur pulled
(3232560085755078826, 11, 13) Merlin taught
(3232560085755078826, 18, 30) 'Oh nyo!' thought Arthur 'Merlin saw me'


In [86]:
#Custom Components
from spacy.language import Language

nlp = spacy.load("en_core_web_sm")
doc = nlp("Britain is a place. Tom is a doctor.")
for ent in doc.ents:
    print(ent.text, ent.label_)
    
@Language.component("remove_gpe") #decorator
def remove_gpe(doc):
    original_ents = list(doc.ents)
    for ent in doc.ents:
        if ent.label_ == "GPE":
            original_ents.remove(ent)
    doc.ents = original_ents
    return doc
#print(nlp.analyze_pipes())

nlp.add_pipe("remove_gpe", last = True)
print(nlp.pipe_names)
doc = nlp("Britain is a place. Tom is a doctor.")
for ent in doc.ents:
    print(ent.text, ent.label_)

Britain GPE
Tom PERSON
['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer', 'remove_gpe']
Tom PERSON


In [94]:
#RegEx, independent of linguistic features
import re
pattern = r"((\d){1,2} (January|February|March|April|May|June|July|August|September|October|November|December))"

text = "This is a date 2 February. Another date would be 14 August."
matches = re.findall(pattern, text)
print (matches)

text = "Paul Newman was an American actor, but Paul Hollywood is a British TV Host. The name Paul is quite common."
pattern = r"Paul [A-Z]\w+"
matches = re.finditer(pattern, text)
for match in matches:
    print(match)
    
from spacy.tokens import Span
nlp = spacy.blank("en")
doc = nlp(text)

@Language.component("paul_ner")
def paul_ner(doc):
    original_ents = list(doc.ents)
    mwt_ents = []
    for match in re.finditer(pattern, doc.text):
        start, end = match.span()
        span = doc.char_span(start, end)
        if span is not None:
            mwt_ents.append((span.start, span.end, span.text))
    for ent in mwt_ents:
        start, end, name = ent
        per_ent = Span(doc, start, end, label="PERSON")
        original_ents.append(per_ent)
    doc.ents = original_ents
    return doc

nlp = spacy.blank("en")
nlp.add_pipe("paul_ner")
doc = nlp(text)
for ent in doc.ents:
    print(ent.text, ent.label_)

[('2 February', '2', 'February'), ('14 August', '4', 'August')]
<re.Match object; span=(0, 11), match='Paul Newman'>
<re.Match object; span=(39, 53), match='Paul Hollywood'>
Paul Newman PERSON
Paul Hollywood PERSON


## NLTK

In [15]:
import nltk

In [16]:
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to C:\Users\Yan
[nltk_data]     Rong\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\twitter_samples.zip.


True

In [20]:
from nltk.corpus import twitter_samples

In [39]:
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')
text = twitter_samples.strings('tweets.20150430-223406.json')
tweet_tokens = twitter_samples.tokenized('positive_tweets.json')

In [43]:
def get_tweets_for_model(cleaned_tokens_list):
    for tweet_tokens in cleaned_tokens_list:
        yield dict([token, True] for token in tweet_tokens)

positive_tokens_for_model = get_tweets_for_model(positive_tweets)
negative_tokens_for_model = get_tweets_for_model(negative_tweets)
positive_dataset = [(tweet_dict, "Positive")
                     for tweet_dict in positive_tokens_for_model]
positive_dataset[0]

({'#': True,
  'F': True,
  'o': True,
  'l': True,
  'w': True,
  'r': True,
  'i': True,
  'd': True,
  'a': True,
  'y': True,
  ' ': True,
  '@': True,
  'n': True,
  'c': True,
  'e': True,
  '_': True,
  'I': True,
  't': True,
  'P': True,
  'K': True,
  'u': True,
  'h': True,
  '5': True,
  '7': True,
  'M': True,
  'p': True,
  's': True,
  'f': True,
  'b': True,
  'g': True,
  'm': True,
  'k': True,
  ':': True,
  ')': True},
 'Positive')

In [38]:
from nltk import classify
from nltk import NaiveBayesClassifier

In [46]:
classifier = NaiveBayesClassifier.train(positive_dataset)

#print("Accuracy is:", classify.accuracy(classifier, test_data))

print(classifier.show_most_informative_features(10))

Most Informative Features
None


In [37]:
##textblob
from scrapeReddit import *

# Function for sentiment analysis (TextBlob -> pre trained)
def sentAnalysis(string):
    blob = TextBlob(string)
    sentiment = blob.sentiment.polarity
    return round(sentiment,2)

# Function to scrape posts and give sentiment score based on Post Text 
def postSentiment(subreddit, limitNum, analysisFunc):
    modPosts_dict = {"Title": [], "Post Text": [], "Score": [], "ID":[], "Post Object":[],
              "Sentimental Score": [], "Post URL": [], "Comment Sentiment": []
              }
    for post in subreddit.hot(limit=limitNum):
        for word in post.title.split(" "):  #for each word in the post title(its type is list)
            if filterPost(modPosts_dict, word, post.id, "ID"):
                
                sentimentPost = analysisFunc(post.selftext)
                sentimentComment = commentSentiment(post, analysisFunc)

                modPosts_dict["Title"].append(post.title)
                modPosts_dict["Post Text"].append(post.selftext)
                modPosts_dict["Score"].append(post.score)
                modPosts_dict["Sentimental Score"].append(sentimentPost)
                modPosts_dict["Post URL"].append(post.url)
                modPosts_dict["ID"].append(post.id)
                modPosts_dict["Post Object"].append(post)
                modPosts_dict["Comment Sentiment"].append(sentimentComment)


    posts = pd.DataFrame(modPosts_dict)
    return posts

def commentSentiment(post, analysisFunc):
    commentsDF = getComments(post)
    lst = []
    scores = []
    for comment in commentsDF["Comment Text"]:
        firstWord = comment.split(" ")[0]
        sentiment = analysisFunc(comment)
        scores.append(sentiment)
        scores.append(firstWord)
        lst.append(scores)
        scores = []
    return lst


if __name__ == "__main__":
    #find_top_five(nus_sub)
    #print(scrapeModPosts(nus_sub, 200))
    print(postSentiment(nus_sub, 50, sentAnalysis)["Score"])


#problem = code crashes when limit is set high

0    2
Name: Score, dtype: int64
