## Введение в обработку естественного языка

### Урок 1. Предобработка текста

Осуществим предобработку данных из Твиттера, чтобы в дальнейшем использовать очищенные данные для задачи классификации. Данный датасет содержит негативные (label = 1) и нейтральные (label = 0) высказывания.
Для работы объединим train_df и test_df.

In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import warnings 
warnings.filterwarnings("ignore", category=DeprecationWarning)
import os
import pickle

In [2]:
# Lookup table: lowercase English contraction (written with an ASCII
# apostrophe) -> expanded form.
# Ambiguous contractions map to BOTH readings joined by " / "
# (e.g. "he's" -> "he has / he is"), so a substitution inserts both readings
# and the slash into the text.
# Note: several expansions contain a capital "I" even though all keys are
# lowercase.
apostrophe_dict = {
"ain't": "am not / are not",
"aren't": "are not / am not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he had / he would",
"he'd've": "he would have",
"he'll": "he shall / he will",
"he'll've": "he shall have / he will have",
"he's": "he has / he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how has / how is",
"i'd": "I had / I would",
"i'd've": "I would have",
"i'll": "I shall / I will",
"i'll've": "I shall have / I will have",
"i'm": "I am",
"i've": "I have",
"isn't": "is not",
"it'd": "it had / it would",
"it'd've": "it would have",
"it'll": "it shall / it will",
"it'll've": "it shall have / it will have",
"it's": "it has / it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she had / she would",
"she'd've": "she would have",
"she'll": "she shall / she will",
"she'll've": "she shall have / she will have",
"she's": "she has / she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as / so is",
"that'd": "that would / that had",
"that'd've": "that would have",
"that's": "that has / that is",
"there'd": "there had / there would",
"there'd've": "there would have",
"there's": "there has / there is",
"they'd": "they had / they would",
"they'd've": "they would have",
"they'll": "they shall / they will",
"they'll've": "they shall have / they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we had / we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what shall / what will",
"what'll've": "what shall have / what will have",
"what're": "what are",
"what's": "what has / what is",
"what've": "what have",
"when's": "when has / when is",
"when've": "when have",
"where'd": "where did",
"where's": "where has / where is",
"where've": "where have",
"who'll": "who shall / who will",
"who'll've": "who shall have / who will have",
"who's": "who has / who is",
"who've": "who have",
"why's": "why has / why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you had / you would",
"you'd've": "you would have",
"you'll": "you shall / you will",
"you'll've": "you shall have / you will have",
"you're": "you are",
"you've": "you have"
}

# Lookup table: lowercase chat/SMS abbreviation -> expanded phrase.
# Some expansions themselves contain apostrophes, commas or capital letters
# (e.g. "idk" -> "I don't know"), so they introduce punctuation and
# contractions into the text they are substituted into.
short_word_dict = {
"121": "one to one",
"a/s/l": "age, sex, location",
"adn": "any day now",
"afaik": "as far as I know",
"afk": "away from keyboard",
"aight": "alright",
"alol": "actually laughing out loud",
"b4": "before",
"b4n": "bye for now",
"bak": "back at the keyboard",
"bf": "boyfriend",
"bff": "best friends forever",
"bfn": "bye for now",
"bg": "big grin",
"bta": "but then again",
"btw": "by the way",
"cid": "crying in disgrace",
"cnp": "continued in my next post",
"cp": "chat post",
"cu": "see you",
"cul": "see you later",
"cul8r": "see you later",
"cya": "bye",
"cyo": "see you online",
"dbau": "doing business as usual",
"fud": "fear, uncertainty, and doubt",
"fwiw": "for what it's worth",
"fyi": "for your information",
"g": "grin",
"g2g": "got to go",
"ga": "go ahead",
"gal": "get a life",
"gf": "girlfriend",
"gfn": "gone for now",
"gmbo": "giggling my butt off",
"gmta": "great minds think alike",
"h8": "hate",
"hagn": "have a good night",
"hdop": "help delete online predators",
"hhis": "hanging head in shame",
"iac": "in any case",
"ianal": "I am not a lawyer",
"ic": "I see",
"idk": "I don't know",
"imao": "in my arrogant opinion",
"imnsho": "in my not so humble opinion",
"imo": "in my opinion",
"iow": "in other words",
"ipn": "I’m posting naked",
"irl": "in real life",
"jk": "just kidding",
"l8r": "later",
"ld": "later, dude",
"ldr": "long distance relationship",
"llta": "lots and lots of thunderous applause",
"lmao": "laugh my ass off",
"lmirl": "let's meet in real life",
"lol": "laugh out loud",
"ltr": "longterm relationship",
"lulab": "love you like a brother",
"lulas": "love you like a sister",
"luv": "love",
"m/f": "male or female",
"m8": "mate",
"milf": "mother I would like to fuck",
"oll": "online love",
"omg": "oh my god",
"otoh": "on the other hand",
"pir": "parent in room",
"ppl": "people",
"r": "are",
"rofl": "roll on the floor laughing",
"rpg": "role playing games",
"ru": "are you",
"shid": "slaps head in disgust",
"somy": "sick of me yet",
"sot": "short of time",
"thanx": "thanks",
"thx": "thanks",
"ttyl": "talk to you later",
"u": "you",
"ur": "you are",
"uw": "you’re welcome",
"wb": "welcome back",
"wfm": "works for me",
"wibni": "wouldn't it be nice if",
"wtf": "what the fuck",
"wtg": "way to go",
"wtgp": "want to go private",
"ym": "young man",
"gr8": "great"
}


# Lookup table: emoticon -> sentiment word ("happy" or "sad").
# Keys must not contain whitespace: they are compared against tokens produced
# by str.split(), which never include spaces.
emoticon_dict = {
    ":)": "happy",
    ":-)": "happy",  # plain ASCII hyphen-minus variant
    ":\u2011)": "happy",  # same emoticon typed with U+2011 NON-BREAKING HYPHEN
    ":-]": "happy",
    ":-3": "happy",
    ":->": "happy",
    "8-)": "happy",
    ":-}": "happy",
    ":o)": "happy",
    ":c)": "happy",
    ":^)": "happy",
    "=]": "happy",
    "=)": "happy",
    "<3": "happy",
    ":-(": "sad",
    ":(": "sad",
    ":c": "sad",
    ":<": "sad",
    ":[": "sad",
    ">:[": "sad",
    ":{": "sad",
    ">:(": "sad",
    ":-c": "sad",
    ":-<": "sad",  # was ":-< " (trailing space), which could never match a token
    ":-[": "sad",
    ":-||": "sad",
}

In [3]:
# Load the training tweets (columns: id, label, tweet) and preview the first rows.
train_df = pd.read_csv('train_tweets.csv')
train_df.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [4]:
# Load the test tweets (columns: id, tweet -- no label) and preview the first rows.
test_df = pd.read_csv('test_tweets.csv')
test_df.head()

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew..."


In [5]:
# Stack train and test rows into a single frame with a fresh 0..N-1 index.
# Test rows carry no label, so `label` is missing (NaN) for them.
combine_df = pd.concat([train_df, test_df], ignore_index=True)

In [6]:
# DataFrame.info() writes the summary to stdout itself and returns None,
# so wrapping it in print() only appended a stray "None" line.
combine_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49159 entries, 0 to 49158
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      49159 non-null  int64  
 1   label   31962 non-null  float64
 2   tweet   49159 non-null  object 
dtypes: float64(1), int64(1), object(1)
memory usage: 1.1+ MB
None


___

1. Удалим @user из всех твитов с помощью паттерна "@[\w]*". Для этого создадим функцию: 
 - для того, чтобы найти все вхождения паттерна в тексте, необходимо использовать re.findall(pattern, input_txt)
 - для замены @user на пустую строку необходимо использовать re.sub()


In [7]:
# Strip every @mention ("@" followed by any run of word characters).
# The pattern is a raw string so that `\w` reaches the regex engine as-is
# instead of being parsed as an (invalid) Python string escape.
combine_df['tweet'] = combine_df['tweet'].apply(lambda x: re.sub(r"@[\w]*", '', x))

In [8]:
combine_df.head(5)

Unnamed: 0,id,label,tweet
0,1,0.0,when a father is dysfunctional and is so sel...
1,2,0.0,thanks for #lyft credit i can't use cause th...
2,3,0.0,bihday your majesty
3,4,0.0,#model i love u take with u all the time in ...
4,5,0.0,factsguide: society now #motivation


Все ок

___

2. Изменим регистр твитов на нижний с помощью .lower()

In [9]:
# Lower-case every tweet, one element at a time.
combine_df['tweet'] = combine_df['tweet'].map(str.lower)

In [10]:
combine_df.sample(10)

Unnamed: 0,id,label,tweet
16808,16809,1.0,#aleppo #orlando #syria #trump #war #berlin...
23207,23208,0.0,"what a shame to see after 16h of training, the..."
41677,41678,,henry is #allout.
28125,28126,0.0,their lives were taken because they were gay!
33994,33995,,#amarinder destroyed #sugarcane industry in #p...
45470,45471,,beautiful beach #onlyindominicanrepublic #blog...
14529,14530,0.0,#wild #iris - #nature #photography quality...
7208,7209,0.0,"achievement unlocked! finally, played on stein..."
3399,3400,1.0,"in the #gop #deplorable i have seen many ""con..."
22499,22500,0.0,question is am i more upset about not being ab...


Ок

___

3. Заменим сокращения с апострофами (пример: ain't, can't) на их полные формы, используя apostrophe_dict. Для этого необходимо сделать функцию: для каждого слова в тексте (for word in text.split()) проверить, есть ли оно в словаре apostrophe_dict в качестве ключа (сокращенного слова), и если есть, заменить ключ на значение (полную версию слова).

In [11]:
def words_change(row, change_dict):
    """Substitute whole tokens of ``row['tweet']`` using a lookup table.

    The tweet is split on whitespace; each token that is a key of
    ``change_dict`` is replaced by the mapped value, every other token is
    kept unchanged. Tokens are re-joined with single spaces, so runs of
    whitespace collapse and leading/trailing whitespace is dropped.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a 'tweet' string.
        change_dict: Mapping from token to its replacement text.

    Returns:
        The rewritten tweet text.
    """
    replaced = []
    for token in row['tweet'].split():
        # Fall back to the token itself when the table has no entry for it.
        replaced.append(change_dict.get(token, token))
    return ' '.join(replaced)

In [12]:
# Expand contractions row by row; the lookup table is forwarded to
# words_change as an extra positional argument.
combine_df['tweet'] = combine_df.apply(words_change, axis=1, args=(apostrophe_dict,))

In [13]:
combine_df.sample(10)

Unnamed: 0,id,label,tweet
23024,23025,0.0,so excited for tonight's 'making it happen' aw...
3803,3804,0.0,your desire for miscegenation genocide is your...
34058,34059,,yea he is
33979,33980,,ryderrideu - see you c2c and cum together: #sn...
46084,46085,,can #lighttherapy help with or #depression? #a...
33610,33611,,i am so to have this! this makes in #float #pe...
48745,48746,,"check out this new trending #funny #gif ! , 30..."
8495,8496,0.0,i love summer and grillin' #grillingseason #ha...
47139,47140,,"i am performing this friday 17th june at 9pm, ..."
7000,7001,0.0,waiting in the dark for my first film of which...


Все хорошо

____

4. Заменим сокращения на их полные формы, используя short_word_dict. Для этого воспользуемся функцией, используемой в предыдущем пункте.

In [14]:
# Expand chat abbreviations row by row; the lookup table is forwarded to
# words_change as an extra positional argument.
combine_df['tweet'] = combine_df.apply(words_change, axis=1, args=(short_word_dict,))

In [15]:
combine_df.sample(10)

Unnamed: 0,id,label,tweet
27137,27138,0.0,"â #gbp/cad resumes decline, hits fresh 1-mon..."
9674,9675,0.0,"you will not be punished for your anger, you w..."
13842,13843,0.0,13.6.15 #brã¼ssel #13615#onedirection #oneyear...
759,760,0.0,just bought: 'launch: using design thinking......
48567,48568,,it has / it is all about sharing #love and mom...
11043,11044,0.0,im passed the point where i need astronomical ...
39041,39042,,340+ ev for hrc. you shall / you will have a l...
20995,20996,0.0,"but for realz, where are those jersey pre-orde..."
14607,14608,0.0,i need this book asap! wow! #foodporn #vegan #...
32697,32698,,brought out the aluminum equipment!!! #collisi...


Done

5. Заменим эмотиконы на их текстовые обозначения (пример: ":)" = "happy"), используя emoticon_dict. Для этого воспользуемся функцией, используемой в предыдущем пункте.

In [16]:
# Replace emoticon tokens with their sentiment word row by row; the lookup
# table is forwarded to words_change as an extra positional argument.
combine_df['tweet'] = combine_df.apply(words_change, axis=1, args=(emoticon_dict,))

In [17]:
combine_df.sample(10)

Unnamed: 0,id,label,tweet
17380,17381,0.0,"#don't #worry, #be by #kaye #menner #photograp..."
25422,25423,0.0,sad when someone cannot love because they were...
25354,25355,0.0,used to be one of 4 #movie #theaters in #dabro...
27496,27497,0.0,scout meetings are sometimes some of the best ...
23428,23429,0.0,there is a mona lisa style to this man's face ...
13049,13050,0.0,good morning peeps! #buendia #calor #dominican...
32604,32605,,here's a flower for a rainy day ð¸ #london #...
42634,42635,,channel in our inner #goddess with this #lunch...
23764,23765,0.0,you are welcome already introduced my paner to...
23988,23989,0.0,the #fbi #corrupt #doj are #hillaryclinton &am...


Смайликов не осталось

____

6. Заменим пунктуацию на пробелы, используя re.sub() и паттерн r'[^\w\s]'

In [18]:
# Replace every character that is neither a word character nor whitespace
# (i.e. punctuation) with a single space.
combine_df['tweet'] = [
    re.sub(r'[^\w\s]', ' ', text)
    for text in combine_df['tweet']
]

In [19]:
combine_df.sample(10)

Unnamed: 0,id,label,tweet
37550,37551,,i work my ass off for my shopping addictions
1308,1309,0.0,kanami amp sorana ð â â ï â ð thank...
37957,37958,,more sad news mr hockey gordiehowe died 25 ...
9977,9978,0.0,I am glad lebron finally got what he always wa...
28225,28226,0.0,workers are good workers learn how to motiv...
30993,30994,0.0,i am thankful for knowledge thankful positive
14043,14044,0.0,i am thankful for my best friends thankful ...
33955,33956,,i love summer â ï nofilter nonmakeup su...
34296,34297,,a amp e cancels controversial unscripted serie...
27558,27559,0.0,scheduled delivery is now 23 hrs late no phon...


Знаков препинания не осталось

___

7. Заменим спец. символы на пробелы, используя re.sub() и паттерн r'[^a-zA-Z0-9]'

In [20]:
# Replace every character outside ASCII letters and digits with a space.
combine_df['tweet'] = [
    re.sub(r'[^a-zA-Z0-9]', ' ', text)
    for text in combine_df['tweet']
]

In [21]:
combine_df.sample(10)

Unnamed: 0,id,label,tweet
12225,12226,0.0,tomhiddleston s day
17891,17892,0.0,nationalbestfriendsday love you jessicalombar...
38702,38703,,i just got to know about orlando but surely ...
19814,19815,0.0,going to see the stone roses next week
27236,27237,0.0,dreams of home happy dream home
31365,31366,1.0,you okay with this of course you are trai...
30026,30027,0.0,better get your fix before heading to brunsco...
1710,1711,0.0,i am orlando sad orlando rip
47824,47825,,izzybelle at the lake doubledoodle doodlesof...
18830,18831,0.0,national bestfriend day is just another day wh...


Остались только буквы и цифры.

___

8. Заменим числа на пробелы, используя re.sub() и паттерн r'[^a-zA-Z]'

In [22]:
# Replace every character that is not an ASCII letter (digits included)
# with a space.
combine_df['tweet'] = [
    re.sub(r'[^a-zA-Z]', ' ', text)
    for text in combine_df['tweet']
]

In [23]:
combine_df.sample(10)

Unnamed: 0,id,label,tweet
35544,35545,,ericfriday comparing a political career to h...
29285,29286,0.0,fridayyyy w minkie friday fri...
31592,31593,0.0,my guests always have fun kayaking amp on...
15320,15321,0.0,traveling bull up you will dominate your bul...
11628,11629,0.0,fathers day aldubebfathersday
35301,35302,,very impoant
24076,24077,0.0,very to announce our new blog read the firs...
26388,26389,0.0,you know you are deep when you try to search ...
9447,9448,0.0,feels bad when you can t explains to people ...
43130,43131,,sad lime is sad lime redbubble stickers cu...


Остались только буквы

___

9. Удалим из текста слова длиной в 1 символ, используя ' '.join([w for w in x.split() if len(w)>1])

In [24]:
# Keep only words longer than one character; words are re-joined with
# single spaces.
combine_df['tweet'] = [
    ' '.join(word for word in text.split() if len(word) > 1)
    for text in combine_df['tweet']
]

In [25]:
combine_df.sample(10)

Unnamed: 0,id,label,tweet
24423,24424,0.0,model love you take with you all the time in ur
6068,6069,0.0,dreamt of you tonight but when finally got to ...
40405,40406,,thank you so much more bihday day bihday to me...
11536,11537,0.0,work work work work work humpday goodmorning w...
19942,19943,0.0,got shot at her own conce rip used to always w...
26781,26782,0.0,pm class and am already here in the classroom ...
8464,8465,0.0,we are over here making plans to go to raging ...
44949,44950,,you realize you are responding to fake picture...
24897,24898,0.0,check out this shi just cant stop smiling let ...
29923,29924,0.0,am celebrated am positive affirmation


Убрали

___

10. Поделим твиты на токены с помощью nltk.tokenize.word_tokenize, создав новый столбец 'tweet_token'.

In [26]:
# Split each cleaned tweet into a list of tokens; the tokenizer is passed
# directly as the per-element function.
combine_df['tweet_token'] = combine_df['tweet'].apply(nltk.tokenize.word_tokenize)

In [27]:
combine_df.sample(3)

Unnamed: 0,id,label,tweet,tweet_token
28251,28252,0.0,amazing health benefits of cucumbers healthy i...,"[amazing, health, benefits, of, cucumbers, hea..."
38928,38929,,mindsconsole make mistake that has that is lif...,"[mindsconsole, make, mistake, that, has, that,..."
13063,13064,0.0,national best friend day,"[national, best, friend, day]"


Получилось

____

11. Удалим стоп-слова из токенов, используя nltk.corpus.stopwords. Создадим столбец 'tweet_token_filtered' без стоп-слов.

In [28]:
# English stop words, stored as a set: the list is only used for `in`
# membership tests, which are O(1) on a set instead of a linear scan per token.
stop_words = set(nltk.corpus.stopwords.words('english'))

In [29]:
def del_stopwords(tokens, stop_words=stop_words):
    """Return the tokens that are not stop words, preserving their order.

    Args:
        tokens: Iterable of word tokens.
        stop_words: Container of words to drop; defaults to the module-level
            ``stop_words`` object as it exists when this function is defined.

    Returns:
        A new list with every token found in ``stop_words`` removed.
    """
    kept = []
    for token in tokens:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

In [30]:
# Drop stop words from every token list (del_stopwords uses its default
# stop-word collection).
combine_df['tweet_token_filtered'] = combine_df['tweet_token'].apply(del_stopwords)

In [31]:
combine_df.head(3)

Unnamed: 0,id,label,tweet,tweet_token,tweet_token_filtered
0,1,0.0,when father is dysfunctional and is so selfish...,"[when, father, is, dysfunctional, and, is, so,...","[father, dysfunctional, selfish, drags, kids, ..."
1,2,0.0,thanks for lyft credit cannot use cause they d...,"[thanks, for, lyft, credit, can, not, use, cau...","[thanks, lyft, credit, use, cause, offer, whee..."
2,3,0.0,bihday your majesty,"[bihday, your, majesty]","[bihday, majesty]"


Удалили

___

12. Применим стемминг к токенам с помощью nltk.stem.PorterStemmer. Создадим столбец 'tweet_stemmed' после применения стемминга.

In [32]:
# Single shared Porter stemmer instance, used as the default stemmer of
# custom_stemming.
stemmer = nltk.stem.PorterStemmer()

In [33]:
def custom_stemming(tokens, stemmer=stemmer):
    """Stem every token and return the stems as a list, in input order.

    Args:
        tokens: Iterable of word tokens.
        stemmer: Object exposing ``stem(word)``; defaults to the module-level
            ``stemmer`` as it exists when this function is defined.

    Returns:
        A new list containing ``stemmer.stem(token)`` for each token.
    """
    return list(map(stemmer.stem, tokens))

In [34]:
# Stem the stop-word-filtered tokens of every tweet (custom_stemming uses
# its default stemmer).
combine_df['tweet_stemmed'] = combine_df['tweet_token_filtered'].apply(custom_stemming)

In [35]:
combine_df.sample(3)

Unnamed: 0,id,label,tweet,tweet_token,tweet_token_filtered,tweet_stemmed
48646,48647,,update social analytics beautiful travel motogp,"[update, social, analytics, beautiful, travel,...","[update, social, analytics, beautiful, travel,...","[updat, social, analyt, beauti, travel, motogp]"
3726,3727,0.0,amateur happy redhead happy redhead redhead pr...,"[amateur, happy, redhead, happy, redhead, redh...","[amateur, happy, redhead, happy, redhead, redh...","[amateur, happi, redhead, happi, redhead, redh..."
4078,4079,0.0,it cool though at this rate nintendo will be d...,"[it, cool, though, at, this, rate, nintendo, w...","[cool, though, rate, nintendo, done, years, ge...","[cool, though, rate, nintendo, done, year, get..."


Ок, получилось

___

13. Применим лемматизацию к токенам с помощью nltk.stem.wordnet.WordNetLemmatizer. Создадим столбец 'tweet_lemmatized' после применения лемматизации.

In [36]:
# Single shared WordNet lemmatizer instance, used as the default lemmatizer
# of custom_lemmatizer.
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

In [37]:
def custom_lemmatizer(tokens, lemmatizer=lemmatizer):
    """Lemmatize every token and return the lemmas as a list, in input order.

    ``lemmatize`` is called with the word only (no part-of-speech argument),
    so the lemmatizer's own default part of speech applies to every token.

    Args:
        tokens: Iterable of word tokens.
        lemmatizer: Object exposing ``lemmatize(word)``; defaults to the
            module-level ``lemmatizer`` as it exists when this function is
            defined.

    Returns:
        A new list containing ``lemmatizer.lemmatize(token)`` for each token.
    """
    lemmas = []
    for token in tokens:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

In [38]:
# Lemmatize the stop-word-filtered tokens of every tweet (custom_lemmatizer
# uses its default lemmatizer).
combine_df['tweet_lemmatized'] = combine_df['tweet_token_filtered'].apply(custom_lemmatizer)

In [39]:
combine_df.sample(10)

Unnamed: 0,id,label,tweet,tweet_token,tweet_token_filtered,tweet_stemmed,tweet_lemmatized
12054,12055,0.0,sun sets and see the wonder of the night posit...,"[sun, sets, and, see, the, wonder, of, the, ni...","[sun, sets, see, wonder, night, positivity, ap...","[sun, set, see, wonder, night, posit, appreci,...","[sun, set, see, wonder, night, positivity, app..."
18021,18022,0.0,we look forward to seeing more adventures,"[we, look, forward, to, seeing, more, adventures]","[look, forward, seeing, adventures]","[look, forward, see, adventur]","[look, forward, seeing, adventure]"
6074,6075,1.0,never be content to sit back and watch as othe...,"[never, be, content, to, sit, back, and, watch...","[never, content, sit, back, watch, others, rig...","[never, content, sit, back, watch, other, righ...","[never, content, sit, back, watch, others, rig..."
12165,12166,0.0,life is what you make of it amp choose make mi...,"[life, is, what, you, make, of, it, amp, choos...","[life, make, amp, choose, make, mine, happy, o...","[life, make, amp, choos, make, mine, happi, on...","[life, make, amp, choose, make, mine, happy, o..."
37382,37383,,that ll be right damn freeloading refugee,"[that, ll, be, right, damn, freeloading, refugee]","[right, damn, freeloading, refugee]","[right, damn, freeload, refuge]","[right, damn, freeloading, refugee]"
44036,44037,,that conference was great,"[that, conference, was, great]","[conference, great]","[confer, great]","[conference, great]"
22238,22239,0.0,smile with confidence at school backtoschool s...,"[smile, with, confidence, at, school, backtosc...","[smile, confidence, school, backtoschool, smil...","[smile, confid, school, backtoschool, smile, s...","[smile, confidence, school, backtoschool, smil..."
48353,48354,,patrons review us on zomato firewoodbiryani,"[patrons, review, us, on, zomato, firewoodbiry...","[patrons, review, us, zomato, firewoodbiryani]","[patron, review, us, zomato, firewoodbiryani]","[patron, review, u, zomato, firewoodbiryani]"
44100,44101,,allbarzab aint got time prod by np on soundclo...,"[allbarzab, aint, got, time, prod, by, np, on,...","[allbarzab, aint, got, time, prod, np, soundcl...","[allbarzab, aint, got, time, prod, np, soundcl...","[allbarzab, aint, got, time, prod, np, soundcl..."
19246,19247,0.0,saw miller lite beer,"[saw, miller, lite, beer]","[saw, miller, lite, beer]","[saw, miller, lite, beer]","[saw, miller, lite, beer]"


Получилось

14. Сохраним результат предобработки в pickle-файл.

In [40]:
# Persist the fully preprocessed frame (all columns added above) to a
# pickle file in the current working directory.
combine_df.to_pickle('prepared_data.pkl')