In [None]:
import difflib
import nltk
from nltk.tokenize import TweetTokenizer
import regex as re
from pprint import pprint
from collections import Counter
import pandas as pd

In [None]:
def read_text(path):
    """Return the whole content of the text file at ``path``, decoded as UTF-8."""
    # An explicit encoding keeps emoji and other non-ASCII characters from
    # being decoded with a platform-dependent default.
    # NOTE(review): assumes the pattern and dataset files are UTF-8 - confirm.
    with open(path, "r", encoding="utf-8") as f:
        return f.read()


# Token pattern applied to all three corpora below.
pat = re.compile(read_text("pattern.txt"))

irony = read_text("tweeteval/datasets/irony/train_text.txt")
corp_irony = re.findall(pat, irony)

stance_climate = read_text("tweeteval/datasets/stance/climate/train_text.txt")
corp_stance = re.findall(pat, stance_climate)

stance_hillary = read_text("tweeteval/datasets/stance/hillary/train_text.txt")
corp_hillary = re.findall(pat, stance_hillary)

# Small probe of how the pattern treats hashtags that start with digits.
test = re.findall(pat, "#624 #4anyone #26 #23842 #2 #44444 #anyone")

In [None]:
# Show what the pattern extracted from the hashtag/number probe string.
print(test)

['2', '#anyone']


In [None]:
# Tokenise the irony and climate-stance texts with a default TweetTokenizer,
# each text being passed to the tokenizer as one single string.
nltk_token = TweetTokenizer()
nltk_irony, nltk_stance = (
    nltk_token.tokenize(text) for text in (irony, stance_climate)
)

In [None]:
# Tokenise the Hillary stance text with the same TweetTokenizer instance.
nltk_hillary = nltk_token.tokenize(stance_hillary)



In [None]:
def find_uniques(voc1, voc2, min_count=3):
    """Find the frequent tokens that occur in only one of two token sequences.

    Args:
        voc1: Iterable of hashable tokens (e.g. a list of strings).
        voc2: Iterable of hashable tokens to compare against ``voc1``.
        min_count: Minimum number of occurrences a token needs in its own
            sequence to be reported. The default of 3 keeps tokens that
            appear more than twice.

    Returns:
        A pair ``(only_in_voc1, only_in_voc2)`` of dicts mapping each exclusive
        token to its occurrence count, ordered by descending count. Tokens
        with equal counts keep the order in which they were first seen.
    """
    freq1, freq2 = Counter(voc1), Counter(voc2)

    def _frequent_exclusive(own, other):
        # most_common() yields (token, count) pairs sorted by descending count;
        # membership tests on a Counter are O(1), so no sets are rebuilt.
        return {
            token: count
            for token, count in own.most_common()
            if count >= min_count and token not in other
        }

    return _frequent_exclusive(freq1, freq2), _frequent_exclusive(freq2, freq1)

# NOTE(review): unpacking into `nltk` rebinds the name of the imported `nltk`
# module for the rest of the session; a different name would avoid the shadowing.
our, nltk = find_uniques(corp_hillary, nltk_hillary)

In [None]:
# Report the frequent tokens exclusive to each tokenizer, followed by how
# many such tokens each side has (one count per line).
print(f"Unique to our: {our}\nUnique to nltk: {nltk}")
print(len(our), len(nltk), sep="\n")

Unique to our: {'Hillary Clinton': 18, '!!!': 18, '!!': 17, 'w/': 8, '??': 6, 'White House': 4, '!!!!': 4, 'HILLARY CLINTON': 3, 'THE TRUTH': 3, 'NOTHING TO': 3, 'Bill Clinton': 3, 'Clinton Foundation': 3, 'Fox News': 3}
Unique to nltk: {'.': 424, '!': 248, ',': 211, '?': 122, '"': 80, '&': 40, '-': 31, ':': 23, "'": 20, '/': 14, '..': 14, ')': 13, '(': 11, 'w': 8, '%': 7, 'THE': 7, ';': 6, '=': 6, 'OF': 5, 'News': 5, 'House': 4, 'CLINTON': 3, 'NOTHING': 3, 'Foundation': 3, 'SLOWLY': 3, 'Campaign': 3, '#happy_life': 3, '#freedom_justice_equality_education': 3, '10': 3, 'HIDE': 3, 'Fox': 3, '___': 3, '>': 3, 'TRUTH': 3, '11': 3, 'AND': 3, '1st': 3}
13
37


When comparing our own tokenizer to the TweetTokenizer from the NLTK library, we wanted to see how many distinct tokens each tokenizer produced that the other did not. Since many of these tokens might appear only once or twice in the whole corpus, we restricted the comparison to tokens that a tokenizer produced more than twice.

We tested the tokenizers on the Hillary stance training set. Our tokenizer produced 13 distinct tokens, each appearing more than twice in the corpus, that the NLTK tokenizer did not produce. Conversely, the NLTK tokenizer produced 37 such tokens that our tokenizer did not produce. The main differences among the most frequent tokens were: (1) the NLTK tokenizer emits single punctuation marks such as "!", "." and "?" as tokens, while ours purposely does not; and (2) our initial idea was to keep proper names such as "New York" and "Hillary Clinton" together as one token, and also to extract meaning from things such as ellipses or multiple exclamation marks. This is reflected in the most frequent tokens unique to our tokenizer.

Later, we modified our preprocessing method to convert all letters in the corpus to lowercase. This made our decision to group proper names into a single token redundant.

In [None]:
def normalise_punct(corp):
    """Collapse runs of repeated punctuation in each token to two characters.

    A run is one of the characters ``- / ( ) ! + , & ?`` repeated two or more
    times in a row; for example ``"!!!!"`` becomes ``"!!"``. Any text around
    the run is left untouched.

    Args:
        corp: List of string tokens. It is modified in place.

    Returns:
        None.
    """
    run_pat = re.compile(r"([-/()!+,&?])\1+")
    for idx, token in enumerate(corp):
        # Substitute only the matched run. Slicing the token to its first two
        # characters would also drop any other text the token contains.
        corp[idx] = run_pat.sub(r"\1\1", token)


def convert_w(corp):
    """Expand the abbreviations ``w/`` and ``w/o`` in a token list, in place.

    Args:
        corp: List of string tokens. Tokens equal to ``"w/"`` are replaced by
            ``"with"`` and tokens equal to ``"w/o"`` by ``"without"``.

    Returns:
        None.
    """
    expansions = {"w/": "with", "w/o": "without"}
    # enumerate() supplies the integer position needed for assignment;
    # indexing the list with the token itself raises TypeError.
    for idx, token in enumerate(corp):
        if token in expansions:
            corp[idx] = expansions[token]


In [None]:
# Diff the two token sequences. Every entry of `result` is a token prefixed
# with a difflib code: " " (in both), "-" (only in the first sequence),
# "+" (only in the second sequence) or "?".
# NOTE(review): per the difflib documentation, "? " lines are hints marking
# intraline differences and belong to neither input - confirm that the
# "Not caught by any" label is what is meant.
d = difflib.Differ()
result = list(d.compare(corp_irony, nltk_irony))

same_lst, nltk_lst, our_lst, not_lst = [], [], [], []
by_code = {" ": same_lst, "+": nltk_lst, "-": our_lst, "?": not_lst}
for line in result:
    bucket = by_code.get(line[:1])
    if bucket is not None:
        bucket.append(line)

same_c = len(same_lst)
uniq_nltk = len(nltk_lst)
uniq_our = len(our_lst)
not_present = len(not_lst)

print(
    "Agreed: ", same_c,
    "\nUnique to NLTK: ", uniq_nltk,
    "\nUnique to our: ", uniq_our,
    "\nNot caught by any: ", not_present,
)

Agreed:  37691 
Unique to NLTK:  7331 
Unique to our:  1253 
Not caught by any:  345


In [None]:
# Recompute the diff between our irony tokens and NLTK's irony tokens.
# The tallying code below is commented out, so this cell only rebinds
# `d` and `result`.
d = difflib.Differ()
result = list(d.compare(corp_irony, nltk_irony))

# same_lst = [line for line in result if line.startswith(" ")]
# nltk_lst = [line for line in result if line.startswith("+")]
# our_lst = [line for line in result if line.startswith("-")]
# not_lst = [line for line in result if line.startswith("?")]

# same_c = len(same_lst)
# uniq_nltk = len(nltk_lst)
# uniq_our = len(our_lst)
# not_present = len(not_lst)

# print(same_c, uniq_nltk, uniq_our, not_present)

# print(uniq_our + uniq_nltk)

# print(our_lst)


In [None]:
test_str = """All I can say is I'm lucky. 
#notcies #eu Liverpool workers "miscarriage of justice" victims 
@user yesss! What makes it worse is when they look gorgeous and say "I just got my face beat!" Wtf?  #Contradiction 
Xmas on the blog feat @user and @user * Read our story and share the LOVE ‚ù§Ô∏è Click the link... 
Watching the move 'Begin Again' and the verisimilitude is overpowering, it's like I'm back in the 90s music business again.   
This is not the moon. Pictures like the moon is made of light bulbs.  #the #moon :) 
I got ready and then got to school and parked in less than 12 minutes! #miracle 
@user yeah. So as you can see, I have great success with the ladies! And I'm totally excited for having sex some more!  
I don't like clowns but I'm going to be one.  
MLS Transactions 2015 #MLS making waves again   2 b fair it will take more than 2 players to fix this 
@user I'll be a bit sweaty by the time I get to you! 
 a bad game last night. Way to go Packers! 
I hope I could recover from fever today. I need to start with strama.. 
somebody wake me up early tomorrow ive been facing weird aches in my back since early december .,. and why do u think that relates madaka 
The Champions League is overrated anyway!  
Porygon2 are  found in the www.monstermmorpg. com wild. #firemen follow @user #paint 
@user are you looking at the wrong profile picture? 
@user One day I want to travel with my bestfriend üåè‚úàÔ∏è DONE DID TRAVELED DA WORLD!! @user ‚ù§Ô∏è 
@user u simply cant win with @user if it is twitter fight!!! :-P 
Fully charged my #Anker portable charger...it lasted 1/2 an hour.  Awesome #Fail 
#Italy -- #Cabinet #approves #first #planks of #Renzi's #labour #reform. via @user 
ruling party in power#central#state#misusing their power#PM speaking only in foreign parliment#pm to visit out side india during session 
Gareth's polar opposite is a chicken-loving vegetarian üòÇüê£  #Bones @user 
Watching creepy shit before bed when alone = bad idea. Is there a spell to turn a French bulldog into a big ass bulldog? #bewareofdog  
i do occupy rent free space in his cranial cavity LOL @user 
Had to take a #PatioPics - snow falling still. This was totally clear when I went to sleep. #WCCO 
August has the most birthdays, February has the least and most of the serial killers are born in November!||-so dont mess up with me|#nov26 
Lol RT @user Wouldn't surprise me if Soldado bangs in a hatrick and we win 0-3 against Chelsea tonight .. The legend is back 
My husband thinks I'm crazy because I taped my tape dispenser. Hehe. I'm handy like that. .... 
My secret name is lizard squad. I like to ruin people's fun time. Follow and rt to a billion and you'll have fun. #psn  #giveitup 
Tomorrow's afternoon #NFL sked in #PanamaCity area: WECP 12p #KCvsPIT, 3:25p #INDvsDAL; WPGX 12p #ATLvsNO. 
@user @user I sure hope ev1 vaccinations are up to date! #GoBolts @user @user @user 
Pulis turned down #NUFC cos he wants to spend a load of money on 30 year old journeymen. Parish wouldn't let him & neither would MA. #cpfc 
Sending best wishes to all my coworkers at the 9AM this morning  
@user try having no internet for a month. Now I know how Ethiopians feel.  
so, sane peoples would talk to themselves in twitter because they can't find other sane humans to talk to. that  #retweet#ifagree 
Thanks @user for connecting. Always look forward to exchange thoughts n ideas with #entrepreneur working on #green n #sustainability 
Seems as if @user wants to endorse me on LinkedIn for  - any thoughts on this from the #OMCchat crowd? 
Love being made fun of  
@user @user @user Oh wow your talking Skype! Cool!  
he was half of what she deserved, yet he was all that she ever wanted ,,,,  
5:30AM 00:00 12PM
#Christmas  #been #the #best @ West Monkseaton 
Parking meter obviously forgot to get its own parking ticket.  
About to fuck up this Media exam  #actuallyihopeso 
well today is gonna be a great day üëå  
Heaven help the fool who did her wrong 
Just bartered for a bottle of rum in best one, and got it down from ¬£18 to ¬£14. Happy Fucking New Year to me!! 
find ONE local PD that reported an 80% drop @user @user @user @user @user @user 
@user @user Money 4 Church|http://t.co/Q2WB7riAvK|SmartPhone APP PAYS you!|See-http://t.co/RDlRuGN0iE |Go 2: 
Well it's always a good time losing at the Bay... @user @user @user 
Welsh devolution? How's this for starters... 
I love when folks call Brady a system QB but are THE BIGGEST Peyton Manning fans. . 
@user Instead of playing the pompous "do you know who I am card?" , how about you actually make an educated rebuttal? 
Kind of love how I got a voicemail from my seat neighbor wondering where I was yet they constantly sell their ticket & I never ask  
I feel a nap in my near future. #NapTime 
#AnalScreen #Exotic Exotic brunette gets her little tight butt nailed right on the office desk 
#sundayfunday #mylove #mermaidlove #newyear2015 @ Rockefeller Center 
The ever so caring @user gets to see the siege ending first. Great journalism.  
www.google.com
@user You truly are my son. 
hmmm. I do wonder why Astec has one fewer employee? #lol  
#Germany -- #ECB's #Weidmann #says #German #2015 #growth #may #be #better #than #expected. via @user 
Kevin Durant with 23pts on 8-13 shooting, has this nigga been inefficient since he came.  
This chap seems to be a bit of an over sexed out going extrovert ... Must be his overly masculine voice and demeanor.   
Damit, this fatima bhutto has an instagram account but not pics of her. Some random shit...and then ppl i follow keep posting pics.  
@user so funny lolololol  
@user @user what the hell ever. 
On my lunch break so sleepyüò¥ 
@user @user More clean OR cleaner, never more cleaner  
Amen, that's due to them  having respect for themselves.  ;P ;p :p 8D xD """

link_line = """https://github.com/nltk/nltk/blob/develop/nltk/tokenize/casual.py
http://github .com/nltk/nltk/blob/develop/nltk/tokenize/casual.py
www.github.com/nltk/nltk/blob/develop/nltk/tokenize/casual.py
www.google.org
www.google.com
http://www.github.com/nltk/nltk/blob/develop/nltk.py"""


In [None]:
# The remaining text to tokenise; starts out as the whole irony training text.
line = irony

# Accumulators filled by the tokenising loop further down in this cell.
tokens = []
unmatchable = []

mybe_everything = r"""(https?[:.]?\s?\/\/(?:\s*[^\/\s.]+)+(?:\s*\.\s*[^\/\s.]+)*(?:\s*\/\s*[^\/\s]+)*)|(www?[\..]?\s?\/\/(?:\s*[^\/\s.]+)+(?:\s*\.\s*[^\/\s.]+)*(?:\s*\/\s*[^\/\s]+)*)|(?:[<>]?[:;=8][\-o\*\']?[\)\]\(\[dDpP/\:\}\{@\|\\]|[\)\]\(\[dDpP/\:\}\{@\|\\][\-o\*\']?[:;=8][<>]?|</?3)|[\U0000263a-\U000e007f]|\w+[\.,]\S+|\b\d{1,3}%|(?:1)?0\/10|(?<![!\-:/])\b\d{1}\b(?![!\-:/])|(?<!#)\b[A-Z]\w+\s[A-Z]\p{L}\w*\b|w\/|w\/o|['\"]\b\p{L}+\b(?:['‚Äô]\b\w+\b)?(?:-\b\w+\b)?['\"]|@?#?\b\p{L}+\d*\b(?:['‚Äô]\b\w+\b)?(?:-(?!http)\b\w+\b)?|([A-Z][a-z]+(?=\s[A-Z])(?:\s[A-Z][a-z]+)+)|(\w+[¬¥`']\w+)|(\w+[\.,]\S+)|([\.,?!]+)|(\w+)|(#\w+)|(@\w+)|(@)|([¬¥`'\"].*?[¬¥`'\"])"""
test = r"""(https?[:.]?\s?\/\/(?:\s*[^\/\s.]+)+(?:\s*\.\s*[^\/\s.]+)*(?:\s*\/\s*[^\/\s]+)*)|(www?[\..]?\s?\/\/(?:\s*[^\/\s.]+)+(?:\s*\.\s*[^\/\s.]+)*(?:\s*\/\s*[^\/\s]+)*)|(?:[<>]?[:;=8][\-o\*\']?[\)\]\(\[dDpP/\:\}\{@\|\\]|[\)\]\(\[dDpP/\:\}\{@\|\\][\-o\*\']?[:;=8][<>]?|</?3)|[\U0000263a-\U000e007f]|\w+[\.,]\S+|\b\d{1,3}%|(?:1)?0\/10|(?<![!\-:/])\b\d{1}\b(?![!\-:/])|(?<!#)\b[A-Z]\w+\s[A-Z]\p{L}\w*\b|w\/|w\/o|['\"]\b\p{L}+\b(?:['‚Äô]\b\w+\b)?(?:-\b\w+\b)?['\"]|@?#?\b\p{L}+\d*\b(?:['‚Äô]\b\w+\b)?(?:-(?!http)\b\w+\b)?|([A-Z][a-z]+(?=\s[A-Z])(?:\s[A-Z][a-z]+)+)|(\d+-\d+)|(\d+/\d+)|(\d+(:?\d+)?((AM)?(PM)?[ps]?))|[\$\¬£\‚Ç¨](\d+(?:\.\d{1,2})?)|(\w+[¬¥`']\w+)|(\w+[\.,]\S+)|([\.,?&!\-\*:;\|=]+)|(\w+)|(#\w+)|(@\w+)|(@)|([¬¥`'\"].*?[¬¥`'\"])"""

# Read the token pattern with an explicit encoding so that non-ASCII
# characters in the pattern file decode the same way on every platform.
# NOTE(review): assumes pattern2.txt is UTF-8 encoded - confirm.
with open("pattern2.txt", "r", encoding="utf-8") as f:
    token_pat = re.compile(f.read())

skippable_pat = re.compile(r'\s+')

# Consume `line` from the front until nothing is left. Each pass removes
# either a run of whitespace, one token, or a stretch of unmatchable text.
while line:
    # Try finding a skippable token delimiter first.
    skippable_match = skippable_pat.search(line)
    if skippable_match and skippable_match.start() == 0:
        # If there is one at the beginning of the line, just skip it.
        line = line[skippable_match.end():]
    else:
        # Else try finding a real token.
        token_match = token_pat.search(line)
        if token_match and token_match.start() == 0:
            # If there is one at the beginning of the line, tokenise it.
            tokens.append(line[:token_match.end()])
            line = line[token_match.end():]
        else:
            # Else there is unmatchable material here.
            # It ends where a skippable or token match starts, or at the end of the line.
            unmatchable_end = len(line)
            if skippable_match:
                unmatchable_end = skippable_match.start()
            if token_match:
                unmatchable_end = min(unmatchable_end, token_match.start())
            # Add it to unmatchable and discard from line.
            unmatchable.append(line[:unmatchable_end])
            line = line[unmatchable_end:]
# tokens = re.findall(token_pat, line)
# unmatchable = re.findall(skippable_pat, line)

# print("\n" + "MATCHED" + "\n" + "-"*100 + "\n")
# print(tokens)
# print("\n" + "UNMATCHED" + "\n" + "-"*100 + "\n")
# print(unmatchable)

# Baseline comparison: diff the tokens found with the pattern.txt regex
# against the NLTK tokens for the irony text.
d = difflib.Differ()
result = list(d.compare(corp_irony, nltk_irony))

# Group the diff entries by their leading difflib code character.
grouped = {
    code: [entry for entry in result if entry.startswith(code)]
    for code in " +-?"
}
same_lst, nltk_lst, our_lst, not_lst = (grouped[code] for code in " +-?")
same_c, uniq_nltk, uniq_our, not_present = (
    len(grouped[code]) for code in " +-?"
)

print(corp_irony)

['seeing', 'ppl', 'walking', 'w/', 'crutches', 'makes', 'me', 'really', 'excited', 'for', 'the', 'next', '3', 'weeks', 'of', 'my', 'life', 'look', 'for', 'the', 'girl', 'with', 'the', 'broken', 'smile', 'ask', 'her', 'if', 'she', 'wants', 'to', 'stay', 'while', 'and', 'she', 'will', 'be', 'loved', 'üíï', 'üéµ', 'Now', 'I', 'remember', 'why', 'I', 'buy', 'books', 'online', '@user', '#servicewithasmile', '@user', '@user', 'So', 'is', 'he', 'banded', 'from', 'wearing', 'the', 'clothes', '#Karma', 'Just', 'found', 'out', 'there', 'are', 'Etch', 'A', 'Sketch', 'apps', '#oldschool', '#notoldschool', 'Hey', 'what', 'do', 'you', 'know', 'one', 'of', 'the', 'witnesses', 'supporting', 'Darren Wilson', 's', 'story', 'lied', 'And', 'is', 'racist', 'Mind', 'blown', '@user', 'on', 'stage', 'at', '#flzjingleball', 'at', 'the', '@user', 'in', '#Tampa', '#iheartradio', 'You', 'know', "it's", 'going', 'to', 'be', 'a', 'great', 'day', 'when', "you're", 'Garmin', 'resets', 'itself', 'and', 'you', 'spill

In [None]:
# Summarise the diff between corp_irony and the NLTK tokens.
summary = f"""
Amount of same tokens: {same_c}
Unique tokens in our tokenizer: {uniq_our}
Unique tokens in nltk tokenizer: {uniq_nltk}
Amount of tokens not present in either tokenizer: {not_present}
Length of full irony text: {len(irony)}"""
print(summary)


Amount of same tokens: 37691
Unique tokens in our tokenizer: 1253
Unique tokens in nltk tokenizer: 7331
Amount of tokens not present in either tokenizer: 345
Length of full irony text: 229674


In [None]:
# Diff the tokens collected by the pattern2.txt tokenising loop against the
# NLTK tokens for the irony text.
d2 = difflib.Differ()
# Use the Differ created in this cell rather than `d` from an earlier cell,
# so the cell does not depend on that variable still being defined.
result2 = list(d2.compare(tokens, nltk_irony))

# Split the diff entries by their leading difflib code character.
same_lst2 = [lin2 for lin2 in result2 if lin2.startswith(" ")]
nltk_lst2 = [lin2 for lin2 in result2 if lin2.startswith("+")]
our_lst2 = [lin2 for lin2 in result2 if lin2.startswith("-")]
not_lst2 = [lin2 for lin2 in result2 if lin2.startswith("?")]

same_c2 = len(same_lst2)
uniq_nltk2 = len(nltk_lst2)
uniq_our2 = len(our_lst2)
not_present2 = len(not_lst2)

In [None]:
# Summarise the diff between the loop-produced tokens and the NLTK tokens.
summary2 = f"""
Amount of same tokens: {same_c2}
Unique tokens in our tokenizer: {uniq_our2}
Unique tokens in nltk tokenizer: {uniq_nltk2}
Amount of tokens not present in either tokenizer: {not_present2}
Length of full irony text: {len(irony)}"""
print(summary2)


Amount of same tokens: 39913
Unique tokens in our tokenizer: 2092
Unique tokens in nltk tokenizer: 5109
Amount of tokens not present in either tokenizer: 639
Length of full irony text: 229674


In [None]:
# Scratch regex experiments on the irony text. Only the value of the last
# expression is shown as the cell output; the results of the earlier
# findall calls are not stored anywhere.
test2 = r".+"
test2_re = re.compile(test2)
re.findall(test2_re, irony)
re.findall(r"(http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)", irony)
re.findall(r"[\U0000263a-\U000e007f]", irony)
re.findall(r"(\w+\S\w+|\w+[\/']?|[@#]?\w+)", irony)

['seeing',
 'ppl',
 'walking',
 'w/',
 'crutches',
 'makes',
 'me',
 'really',
 'excited',
 'for',
 'the',
 'next',
 '3',
 'weeks',
 'of',
 'my',
 'life',
 'look',
 'for',
 'the',
 'girl',
 'with',
 'the',
 'broken',
 'smile',
 'ask',
 'her',
 'if',
 'she',
 'wants',
 'to',
 'stay',
 'while',
 'and',
 'she',
 'will',
 'be',
 'loved',
 'Now',
 'I',
 'remember',
 'why',
 'I',
 'buy',
 'books',
 'online',
 '@user',
 '#servicewithasmile',
 '@user',
 '@user',
 'So',
 'is',
 'he',
 'banded',
 'from',
 'wearing',
 'the',
 'clothes',
 '#Karma',
 'Just',
 'found',
 'out',
 'there',
 'are',
 'Etch',
 'A',
 'Sketch',
 'apps',
 '#oldschool',
 '#notoldschool',
 'Hey',
 'what',
 'do',
 'you',
 'know',
 'one',
 'of',
 'the',
 'witnesses',
 'supporting',
 'Darren',
 "Wilson's",
 'story',
 'lied',
 'And',
 'is',
 'racist',
 'Mind',
 'blown',
 '@user',
 'on',
 'stage',
 'at',
 '#flzjingleball',
 'at',
 'the',
 '@user',
 'in',
 '#Tampa',
 '#iheartradio',
 'You',
 'know',
 "it's",
 'going',
 'to',
 'be',


In [None]:
# Print the complete diff; each entry is a token prefixed with its difflib code.
print(result)

['  seeing', '  ppl', '  walking', '- w/', '+ w', '+ /', '  crutches', '  makes', '  me', '  really', '  excited', '  for', '  the', '  next', '  3', '  weeks', '  of', '  my', '  life', '  look', '  for', '  the', '  girl', '  with', '  the', '  broken', '  smile', '+ ,', '  ask', '  her', '  if', '  she', '  wants', '  to', '  stay', '  while', '+ ,', '  and', '  she', '  will', '  be', '  loved', '+ .', '  üíï', '  üéµ', '  Now', '  I', '  remember', '  why', '  I', '  buy', '  books', '  online', '  @user', '  #servicewithasmile', '  @user', '  @user', '  So', '  is', '  he', '  banded', '  from', '  wearing', '  the', '  clothes', '+ ?', '  #Karma', '  Just', '  found', '  out', '  there', '  are', '  Etch', '  A', '  Sketch', '  apps', '+ .', '  #oldschool', '  #notoldschool', '  Hey', '  what', '  do', '  you', '  know', '+ ,', '  one', '  of', '  the', '  witnesses', '  supporting', '- Darren Wilson', '- s', '+ Darren', "+ Wilson's", '  story', '  lied', '+ !', '  And', '  is

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=e4cdc3a5-dd4a-4d72-a71a-972cea883107' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>