In [4]:
import pandas as pd
import spacy
import re
import pickle

In [5]:
# Load the raw dataset so we can get acquainted with it
df = pd.read_csv('../data/text.csv')

Each entry in this dataset consists of a text segment representing a Twitter message and a corresponding label indicating the predominant emotion conveyed. The emotions are classified into six categories: sadness (0), joy (1), love (2), anger (3), fear (4), and surprise (5). Whether you're interested in sentiment analysis, emotion classification, or text mining, this dataset provides a rich foundation for exploring the nuanced emotional landscape within the realm of social media.

In [6]:
# Class balance: share of each label as a percentage of all rows
df['label'].value_counts().div(len(df['label'])).mul(100)

label
1    33.844519
0    29.074948
3    13.751383
4    11.446970
2     8.290128
5     3.592053
Name: count, dtype: float64

In [7]:
# Drop the leftover index column that came from the CSV.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# has already been removed no longer raises KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [8]:
# The working dataset used in the rest of the notebook
df

Unnamed: 0,text,label
0,i just feel really helpless and heavy hearted,4
1,ive enjoyed being able to slouch about relax a...,0
2,i gave up my internship with the dmrg and am f...,4
3,i dont know i feel so lost,0
4,i am a kindergarten teacher and i am thoroughl...,4
...,...,...
416804,i feel like telling these horny devils to find...,2
416805,i began to realize that when i was feeling agi...,3
416806,i feel very curious be why previous early dawn...,5
416807,i feel that becuase of the tyranical nature of...,3


In [9]:
# Check whether any tweet contains upper-case characters.
# Series.str.isupper() is True only when *every* cased character in the string
# is upper-case, so it would miss mixed-case text such as "I feel". A row
# contains at least one upper-case character exactly when lower-casing changes
# it; the notna() guard keeps missing values out of the result.
mask_uppercase = df['text'].notna() & (df['text'] != df['text'].str.lower())
df[mask_uppercase]

Unnamed: 0,text,label


In [10]:
# Check the tweets for user mentions. First, define a helper function.
def check_user_mentions(text):
    """Return the user names mentioned in ``text`` as ``@name``.

    Each returned item is the run of word characters following an ``@``
    (the ``@`` itself is not included). An empty list means no mentions.
    """
    return re.findall(r'@(\w+)', text)

In [11]:
# Collect every tweet that contains at least one @mention
mentions = [twit for twit in df['text'] if check_user_mentions(twit)]

print(mentions)

[]


In [12]:
# Expand contracted forms that spaCy's tokenizer and lemmatizer may mishandle.
# The table lists both the apostrophe-less spelling ("dont") and the spelling
# where the apostrophe became a space ("don t").
# NOTE(review): "id", "shell", "shed", "wed", "wont" and "cant" are also
# ordinary English words. They are kept as in the original mapping, but
# expanding them can corrupt sentences such as "i shed tears" - confirm that
# this trade-off is intended.
_CONTRACTIONS = {
    "im": "i am",
    "i m": "i am",
    "i ll": "i will",
    "i ve": "i have",
    "ive": "i have",
    "i d": "i would",
    "id": "i would",
    "youre": "you are",
    "you re": "you are",
    "youll": "you will",
    "you ll": "you will",
    "youve": "you have",
    "you ve": "you have",
    "youd": "you would",
    "you d": "you would",
    "hes": "he is",
    "he s": "he is",
    "he ll": "he will",
    "he d": "he would",
    "hed": "he would",
    "shes": "she is",
    "she s": "she is",
    "she ll": "she will",
    "shell": "she will",
    "she d": "she would",
    "shed": "she would",
    "it s": "it is",
    "it d": "it would",
    "itd": "it would",
    "we re": "we are",
    "we ll": "we will",
    "we ve": "we have",
    "weve": "we have",
    "we d": "we would",
    "wed": "we would",
    "they re": "they are",
    "theyre": "they are",
    "theyll": "they will",
    "they ll": "they will",
    "they ve": "they have",
    "theyve": "they have",
    "theyd": "they would",
    "they d": "they would",
    "don t": "do not",
    "dont": "do not",
    "doesn t": "does not",
    "doesnt": "does not",
    "didn t": "did not",
    "didnt": "did not",
    "haven t": "have not",
    "havent": "have not",
    "hasn t": "has not",
    "hasnt": "has not",
    "hadn t": "had not",      # was "hadn t t", which never matched "hadn t"
    "hadnt": "had not",
    "wouldn t": "would not",
    "wouldnt": "would not",
    "won t": "will not",
    "wont": "will not",
    "can t": "can not",
    "cant": "can not",
    "couldn t": "could not",  # was "couldn t t", which never matched "couldn t"
    "couldnt": "could not",
    "shouldn t": "should not",
    "shouldnt": "should not",
    "isn t": "is not",
    "isnt": "is not",
    "weren t": "were not",
    "werent": "were not",
    "wasn t": "was not",
    "wasnt": "was not",
    "aren t": "are not",
    "arent": "are not",
    # The four keys below were misspelled ("woulndnt...", "shoulndnt...") and
    # therefore could never match real text.
    "wouldn t ve": "would not have",
    "wouldntve": "would not have",
    "shouldn t ve": "should not have",
    "shouldntve": "should not have",
}

# One alternation compiled once, instead of rebuilding the table and running a
# separate re.sub() per contraction on every call. Longer keys come first so
# that "wouldn t ve" wins over its prefix "wouldn t".
_CONTRACTION_PATTERN = re.compile(
    r'\b(?:'
    + '|'.join(re.escape(key) for key in sorted(_CONTRACTIONS, key=len, reverse=True))
    + r')\b'
)


def expand_contractions(text):
    """Replace contracted forms in ``text`` with their expanded equivalents.

    Only whole-word matches are replaced (the pattern is anchored with ``\\b``
    on both sides), so "time" is left alone even though it contains "im".
    Matching is case-sensitive and every key in the table is lower-case.

    Args:
        text: The string to normalise.

    Returns:
        A new string with every known contraction expanded.

    Raises:
        TypeError: If ``text`` is not a string (raised by the ``re`` module).
    """
    return _CONTRACTION_PATTERN.sub(
        lambda match: _CONTRACTIONS[match.group(0)], text
    )

In [13]:
# Run the expansion over every tweet and store the result back in the same
# column, replacing the original text with the processed text
df['text'] = df['text'].map(expand_contractions)

In [14]:
# Load the spaCy English pipeline; spaCy is used for tokenization and stop-word removal
nlp = spacy.load("en_core_web_sm")

In [15]:
# Helper that returns the lemmas of a text with stop words removed
def process_text_with_spacy(text):
    """Run ``text`` through the spaCy pipeline and return its non-stop-word lemmas.

    Args:
        text: The string to process with the module-level ``nlp`` pipeline.

    Returns:
        A list with ``token.lemma_`` for every token whose ``is_stop`` flag
        is false, in document order.
    """
    lemmas = []
    for tok in nlp(text):
        if tok.is_stop:
            continue
        lemmas.append(tok.lemma_)
    return lemmas

In [16]:
# Build a list of lists holding the processed (lemmatized, stop-word-free) text.
# nlp.pipe() streams the texts through the pipeline in batches, which spaCy
# recommends over calling nlp() once per text when processing a large corpus.
# The per-document filter is the same one process_text_with_spacy applies.
tokenized_text = [
    [token.lemma_ for token in doc if not token.is_stop]
    for doc in nlp.pipe(df['text'])
]

In [17]:
# Persist the tokenized corpus to disk as a pickle file
with open('../data/tokenized_text.pkl', 'wb') as pickle_file:
    pickle.dump(tokenized_text, pickle_file)