# Добро пожаловать!

Это файл для анализа данных, собранных при помощи файла TikTok Scraping.

Для начала работы запустите ячейки в разделе "Соединение", а затем перейдите к ячейкам нужного блока.

Чтобы не перегружать файл, на каждом этапе мы будем выводить в качестве примера по пять элементов результата.

# Соединение

Логика работы кода: 

1. Создай датафрейм для всех unique_characteristics
2. Для каждой подпапки в текущей директории
3. Возьми файл unique_characteristics
4. Добавь данные к общему датафрейму
5. Сохрани датафрейм как файл

In [1]:
# Импортируем нужные библиотеки и инициализируем функцию для лемматизации текста

import os
import re
import csv
import pandas as pd
from collections import Counter
from stop_words import get_stop_words

from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
import pymorphy2 as pm

def lemmatize_nltk(fname, text):
    """
    fname: название файла, куда будет записан результат лемматизации
    text: текст для анализа
    """
    tokens = word_tokenize(text)
    tagged = pos_tag(tokens)
    wl = WordNetLemmatizer()
    
    with open (fname, 'w', encoding='utf-8') as outfile:
        for i in range(len(tagged)):
            if tagged[i][1].startswith('N'):
                outfile.write(wl.lemmatize(tagged[i][0], pos='n') + ' ')
            elif tagged[i][1].startswith('V') or tagged[i][1].startswith('M'):
                outfile.write(wl.lemmatize(tagged[i][0], pos='v') + ' ')
            else:
                outfile.write(tagged[i][0] + ' ')

# Выведем список всех папок в текущей директории, из которых мы будем подтягивать данные 
                
folders_list_all = os.scandir (os.getcwd())
folders_list_dir = []
for i in folders_list_all:
    if i.is_dir() == True:
        folders_list_dir.append (i.name)
folders_list_dir

['.ipynb_checkpoints',
 'americans',
 'balkan',
 'baltic',
 'chamoney1',
 'easterneuropean',
 'easterneuropeancheck',
 'helvetica12',
 'katteryyna',
 'katteryyna_stitch',
 'miadio',
 'monica_zielinski',
 'nikiproshin',
 'okaykali',
 'postsoviet',
 'SepVideos',
 'slavic',
 'straightouttarussia',
 'teameffujoe',
 'TellMeYouNotAmerican',
 'therussianmatreshka',
 'torryhermann',
 'ugneexo',
 'voidable',
 'voidable2',
 'webkinpoodel']

# Для анализа подписей, стикеров и распознанной речи

In [2]:
# Посмотрим на отдельные файлы с уникальными характеристиками, которые нам предстоит объединить в одну таблицу

path = os.getcwd()
folders_subfiles = []
for i in folders_list_dir:
    folders_subfiles.append (os.listdir (os.path.join (path, i)))
all_csv = []
for i in folders_subfiles:
    for j in i:
        if j[-26:] == 'unique_characteristics.csv':
            all_csv.append (j)
all_csv

['americans_unique_characteristics.csv',
 'balkan_unique_characteristics.csv',
 'baltic_unique_characteristics.csv',
 'chamoney1_unique_characteristics.csv',
 'easterneuropean_unique_characteristics.csv',
 'easterneuropeancheck_unique_characteristics.csv',
 'helvetica12_unique_characteristics.csv',
 'katteryyna_unique_characteristics.csv',
 'katteryyna_unique_characteristics.csv',
 'miadio_unique_characteristics.csv',
 'monica_zielinski_unique_characteristics.csv',
 'nikiproshin_unique_characteristics.csv',
 'okaykali_unique_characteristics.csv',
 'postsoviet_unique_characteristics.csv',
 'SepVideos_unique_characteristics.csv',
 'slavic_unique_characteristics.csv',
 'straightouttarussia_unique_characteristics.csv',
 'teameffujoe_unique_characteristics.csv',
 'TellMeYouNotAmerican_unique_characteristics.csv',
 'therussianmatreshka_unique_characteristics.csv',
 'torryhermann_unique_characteristics.csv',
 'ugneexo_unique_characteristics.csv',
 'voidable_unique_characteristics.csv',
 'void

In [3]:
# Объединим все таблицы в единый датафрейм

def get_folders_name (filename):
    x = filename.split ('_unique_characteristics.csv')
    return x[0]

all_csv_df = pd.DataFrame ()
for i in all_csv:
    new_df = pd.read_csv (os.path.join (path, get_folders_name (i), all_csv [all_csv.index(i)])).drop ('Unnamed: 0', axis = 1)
    all_csv_df = all_csv_df.append (new_df)
    all_csv_df.reset_index().drop_duplicates()
all_csv_df = all_csv_df.drop_duplicates()
all_csv_df [:5]

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


Unnamed: 0,author_id,author_name,author_nickname,aweme_id,comments,date,description,hashtags,likes,link,music,shares,sticker_text,stitched_original_link,stt_text,timestamp_create_time,views
0,6866876516039623686,dMs dOnT wORk,._viv,6913612689344400642,7918,03.01.2021,#fyp #foryou #lmao #animals #russia #usa #amer...,"['#fyp', '#foryou', '#lmao', '#animals', '#rus...",1200000,https://www.tiktok.com/@._viv/video/6913612689...,Originalton,36300,Americans 🇺🇸: we have the best trained animal...,,no no no no i don't either right,1609700895,6900000
1,313224616116707328,Louis Oro,louisoro,6906426527299161349,6747,15.12.2020,#stitch with @voidable_ inspired by @maaaritz...,"['#tiktokparati', '#bellezalatina', '#carcajad...",738400,https://www.tiktok.com/@louisoro/video/6906426...,original sound,3721,CC:QUÉ ES LO MÁS TONTO QUE UN ESTADOUNIDENSE ...,,i'm calling the curious what is the dumbest t...,1608027941,3100000
2,6800387735624893445,Cody Wayne,codywaynestout,6901458722493664518,8675,02.12.2020,🤦🏼‍♂️ #fyp #fypシ #fight #round2 #americans #cd...,"['#fyp', '#fypシ', '#fight', '#round2', '#ameri...",675000,https://www.tiktok.com/@codywaynestout/video/6...,original sound,94300,The CDC Americans,,we have approved a vaccine for nine it's appr...,1606871108,3500000
3,6532295762217320449,usher🍜,theasianusher,6949923736741088517,10900,11.04.2021,American humour at its finest😂 #fyp #foryou #v...,"['#fyp', '#foryou', '#viral', '#trending', '#b...",643700,https://www.tiktok.com/@theasianusher/video/69...,original sound,4087,,,the deodorant as something that england has y...,1618155220,1900000
4,141043686905278464,Michael Vorpahl,michaelvorpahl,6914793867334323462,16600,06.01.2021,This absolutely blows my mind... #twitter #soc...,"['#twitter', '#social', '#president', '#speech...",558900,https://www.tiktok.com/@michaelvorpahl/video/6...,original sound,46800,,,no is check this out to our beloved still rem...,1609975921,2800000


In [None]:
# Сохраним полученный датафрейм в csv-файл

pd.DataFrame.to_csv (all_csv_df, 'all_unique_characteristics.csv')

# Поиск по уникальным характеристикам

In [None]:
for i in all_csv_df.iterrows():
    try:
        if 'switzerland' in i[1]['hashtags']:
            print (i[1]['author_nickname'])
            print (i[1]['description'])
            print (i[1]['sticker_text'])
            print (i[1]['stt_text'])
    except:
        pass

In [None]:
retroriga = []
for i in all_csv_df.iterrows ():
    try:
        if ('easterneuropeancheck' in i[1]['hashtags']) & ('washingtoncheck' in i[1]['hashtags']):
            print (i[1]['description'])
            print (i[1]['sticker_text'])
            print (i[1]['author_nickname'])
            print (i[1]['aweme_id'])
            print (i[1]['stt_text'])
#             retroriga.append (i[1]['author_nickname'])
    except TypeError:
        pass
# len (retroriga)

# Анализ подписей и стикеров

Запускать после выполнения блока "Для анализа подписей, стикеров и распознанной речи"

In [4]:
# Собираем все подписи и стикеры в единый список

all_desc_n_stickers = []
for i in all_csv_df [['description', 'sticker_text']].iterrows():
    try:
        one_string = i[1]['description'] + ' ' + i[1]['sticker_text']
        all_desc_n_stickers.append (one_string)
    except TypeError:
        pass
all_desc_n_stickers [:5]

['#fyp #foryou #lmao #animals #russia #usa #america #americans #tiktok #dog #bird #putin #pigeon  Americans 🇺🇸: we have the best trained animals  Russia 🇷🇺:',
 '#stitch with @voidable_  inspired by @maaaritza #tiktokparati #bellezalatina #carcajadas #aquiaprendo #americans #wearefamily #animo #recordsdays #fyp  CC:QUÉ ES LO MÁS TONTO QUE UN ESTADOUNIDENSE TE HA DICHO? CC:Soy del norte de México, y hace 2 años estaba en la playa  LOUIS.ORO CC:Yo estaba hablando con un estadunidense, y él pensó que yo era británico  CC:CREO QUE FUE POR MI ACENTO CC:Así que le pregunté qué pensaba sobre los mexicanos CC: Y el dijo algo como esto CC: Los mexicanos no prosperan porque son morenos, chaparros, feos CC:Comen frijoles todos los días y no tienen educación. CC:Así que se tienen que ir a mi país  CC:Y luego yo dije CC:¿Estás demente? CC:Soy mexicano, bro CC:No hay nada malo con ser moreno CC: Y sí,  me gustan los frijoles  CC:Pero hoy cené langosta  CC:Y soy como 10 cm más alto que tú CC:Y por cie

In [None]:
# Сохранение csv-файла с подписями и стикерами

with open ('all_descriptions_and_stickers.csv', 'w', newline = '', encoding = 'utf-8') as all_desc_n_stickers_file:
    writer = csv.writer (all_desc_n_stickers_file, delimiter = ",")
    for i in all_desc_n_stickers:
        print (i)
        writer.writerow ([i])

In [None]:
# Сохранение txt-файла с подписями и стикерами

csv_file = 'all_descriptions_and_stickers.csv'
txt_file = 'all_descriptions_and_stickers.txt'
with open(txt_file, "w", encoding = 'utf-8') as my_output_file:
    with open(csv_file, "r", encoding = 'utf-8') as my_input_file:
        [ my_output_file.write(" ".join(row)+'\n') for row in csv.reader(my_input_file)]
    my_output_file.close()

In [5]:
# Загружаем файл для дальнейшей лемматизации

filename = 'all_descriptions_and_stickers.csv'

f = open (filename,'r',encoding='utf-8')
text = str (f.read())

In [6]:
# Лемматизируем файл

lemmatize_nltk ('TRY_lemma_list.txt', text)

In [7]:
# Вновь открываем файл как список полученных лемм

with open ('TRY_lemma_list.txt', 'r', encoding='utf-8') as f:
    lemmatext = str (f.read()).split ()

In [8]:
# Очищаем список лемм от стоп-слов

stop_words = get_stop_words ('english')
stop_words.extend (['don', 't', 'gon', 'na', 'm', 's', 'part', 'Part', 'I', 'im', 'You', 'you', 'YOU', 'u', 'i', 'It', 'it',
                    'cc', 'CC', 'Cc', 'v', 'stitch', "n't", 'go', 'can', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
                    'if', 'If', 'r', 'and', 'And', 'like', 'me', 'Me', 'of', 'Of', 'oh', 'hey', 'a', 'how', 'no', 'are',
                   'in', 'the', 'did', 'do', 'didn', 'ya', 're', 'he', 'or', 'to', 'be'])

clean = []

for lemma in lemmatext:
    if lemma not in stop_words and not re.match('\W+', lemma):
        clean.append(lemma.lower())
        
lemmatext = clean

In [9]:
# Соединяем леммы попарно

pairs = []

for i in range (len(lemmatext)-1):
    if lemmatext[i] != lemmatext[i+1]:
        pair = min (lemmatext[i], lemmatext[i+1]) + '_' + max (lemmatext[i], lemmatext[i+1])
        pairs.append(pair)
pairs [:5]

['foryou_fyp', 'foryou_lmao', 'animal_lmao', 'animal_russia', 'russia_usa']

In [10]:
# Подсчитываем частотность пар

counter_set_pairs = Counter(pairs).most_common()
counter_set_pairs [:5]

[('foryou_fyp', 408),
 ('eastern_european', 358),
 ('foryoupage_fyp', 278),
 ('foryou_foryoupage', 238),
 ('balkan_slavic', 136)]

In [None]:
# Сохраняем полученный результат как csv-файл для дальнейшего преобразования в граф

csv = 'word1,word2,weight' + '\n'

for count in counter_set_pairs:
    if count[1] > 5:
        word1 = count[0].split('_')[0]
        word2 = count[0].split('_')[1]
        weight = str(count[1])
        csv += word1 + ',' + word2 + ',' + weight + '\n'

with open('StickersDescription_ethno_counter_pairs.csv', 'w', encoding = 'utf-8') as f:
    f.write(csv)

# Анализ распознанной речи

Запускать после выполнения блока "Для анализа подписей, стикеров и распознанной речи"

In [11]:
# Собираем всю распознанную речь в единый список

all_stt = []
for i in all_csv_df ['stt_text']:
    try:
        one_string = i + '.'
        all_stt.append (one_string)
    except TypeError:
        pass
all_stt [:5]

[" no no no no i don't either right.",
 " i'm calling the curious what is the dumbest thing in american is if i said to you mm well i am from nothing mexico and she's ago i was in the pitch i was talking to an american in he thought i was british i think it does because of my accent so i asked him what he thinks about mexicans and he said something like this way well mexicans done priest or because they are dark short ugly it being everyday and they have medication so they have to go to my country and then i set are you meant so i am mexican bro this nothing wrong with being dark and yet i like beans that today they had lots tested in and i am about four inches tell the dingy and by the way i think i a handsome than you .",
 " we have approved a vaccine for nine it's approved by scientists if you're just trying to turn isn't just zombies i say why it's my choice will just make it mandatory  yeah.",
 " the deodorant as something that england has yet to discover i bugs bunny way he got a

In [None]:
# Поиск по распознанной речи

for i in all_stt:
    if ('american' in i) & ('recession' in i):
        print (i)

In [None]:
# Сохранение csv-файла с распознанной речью

with open ('all_stt.csv', 'w', newline = '', encoding = 'utf-8') as all_stt_file:
    writer = csv.writer (all_stt_file, delimiter = ",")
    for i in all_stt:        
        writer.writerow ([i])

In [None]:
# Сохранение txt-файла с распознанной речью

csv_file = 'all_stt.csv'
txt_file = 'all_stt.txt'
with open(txt_file, "w", encoding = 'utf-8') as my_output_file:
    with open(csv_file, "r", encoding = 'utf-8') as my_input_file:
        [ my_output_file.write(" ".join(row)+'\n') for row in csv.reader(my_input_file)]
    my_output_file.close()

In [12]:
# Высчитываем частотные пары слов

filename = 'all_stt.csv'

f = open (filename,'r',encoding='utf-8')
text = str (f.read())
lemmatize_nltk ('TRY_lemma_list_stt.txt', text)
with open ('TRY_lemma_list_stt.txt', 'r', encoding='utf-8') as f:
    lemmatext = str (f.read()).split ()

stop_words = get_stop_words ('english')
stop_words.extend (['don', 't', 'gon', 'na', 'm', 's', 'part', 'Part', 'I', 'im', 'You', 'you', 'YOU', 'u', 'i', 'It', 'it',
                    'cc', 'CC', 'Cc', 'v', 'stitch', "n't", 'go', 'can', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
                    'if', 'If', 'r', 'and', 'And', 'like', 'me', 'Me', 'of', 'Of', 'oh', 'hey', 'a', 'how', 'no', 'are',
                   'in', 'the', 'did', 'do', 'didn', 'ya', 're', 'he', 'or', 'to', 'be', 'is', 'as', 'that', 'tho', 'though',
                   'also', 'la', 've', 'em', 'ta', 'e', 'el', 'ah', 'didnt', 'dont', 'hi', 'ooh', 'mm', 'huh', 'yeah'])


clean = []

for lemma in lemmatext:
    if lemma not in stop_words and not re.match('\W+', lemma):
        clean.append(lemma)
        
lemmatext = clean

pairs = []

for i in range (len(lemmatext)-1):
    if lemmatext[i] != lemmatext[i+1]:
        pair = min (lemmatext[i], lemmatext[i+1]) + '_' + max (lemmatext[i], lemmatext[i+1])
        pairs.append(pair)

counter_set_pairs = Counter(pairs).most_common()
counter_set_pairs [:5]

[('tell_us', 1051),
 ('actually_tell', 686),
 ('actually_without', 201),
 ('actually_us', 199),
 ('us_without', 158)]

In [13]:
# Объединяем схожие значения

lemmatext = ['american' if x == 'america' else x for x in lemmatext]
lemmatext = ['american' if x == 'americans' else x for x in lemmatext]
lemmatext = ['russian' if x == 'russia' else x for x in lemmatext]
lemmatext = ['russian' if x == 'russians' else x for x in lemmatext]
lemmatext = ['european' if x == 'europeans' else x for x in lemmatext]
lemmatext = ['european' if x == 'europe' else x for x in lemmatext]
lemmatext = ['ukrainian' if x == 'ukraine' else x for x in lemmatext]
lemmatext [:5]

['either', 'right', 'call', 'curious', 'dumbest']

In [14]:
# Фильтруем список частотных пар, оставляя лишь пары с упоминанием этничностей и/ или региональных идентичностей

ethnolist = ['russian', 'russia', 'usa', 'american', 'america','state', 'united', 'states',
             'slav', 'slavs', 'slavic', 'baltic', 'eastern', 'european', 'lithuania', 'lithuanian', 'polish', 'poland', 
             'ukranian', 'eastern', 'balkan', 'postsoviet', 'yugoslav', 'yugoslavia', 'yugoslavian', 'yugo',
             'estonian', 'estonia', 'romanian', 'romania', 'latvian', 'latvia', 'ukraine', 'ukrainian',  
             'czechia', 'czech', 'slovenia', 'slovenian', 'bosnian', 'bosnia', 'herzegovina', 'albania', 'albanian', 
            'montenegro', 'montenegrin', 'kosovo', 'kosovar', 'kosovan', 'serbia', 'serbian', 'serb', 
            'bulgaria', 'bulgarian', 'moldova', 'moldovian', 'belarus', 'belarusian', 'georgia', 'georgian', 
             'kazakh', 'kazakhstan', 'armenia', 'armenian', 'soviet', 'croatia', 'macedonia', 'macedonian', 
             'mexico', 'mexican', 'canada', 'canadian', 'australia', 'australian', 'native', 'britain', 'kingdom']
pairs = []


for i in range (len(lemmatext)-1):
    if (lemmatext[i] in ethnolist) & (lemmatext[i] != lemmatext[i+1]):
        pair = min (lemmatext[i], lemmatext[i+1]) + '_' + max (lemmatext[i], lemmatext[i+1])
        pairs.append(pair)

counter_set_pairs = Counter(pairs).most_common()
counter_set_pairs [:5]

[('eastern_european', 174),
 ('state_united', 70),
 ('american_just', 59),
 ('american_people', 28),
 ('american_versus', 27)]

In [None]:
# Сохраняем результат в csv-файл

csv = 'word1,word2,weight' + '\n'

for count in counter_set_pairs:
    if count[1] > 2:
        word1 = count[0].split('_')[0]
        word2 = count[0].split('_')[1]
        weight = str(count[1])
        csv += word1 + ',' + word2 + ',' + weight + '\n'

with open('Stt_ethno_ETHNICITIES_counter_pairs.csv', 'w') as f:
    f.write(csv)

# Анализ тэгов

In [15]:
# Посмотрим на отдельные файлы с тэгами, которые нам предстоит объединить в одну таблицу

def get_folders_name_tags (filename):
    x = filename.split ('_tags.csv')
    return x[0]

path = os.getcwd()
folders_subfiles = []
for i in folders_list_dir:
    folders_subfiles.append (os.listdir (os.path.join (path, i)))
all_csv = []
for i in folders_subfiles:
    for j in i:
        if j[-8:] == 'tags.csv':
            all_csv.append (j)
all_csv

['americans_tags.csv',
 'balkan_tags.csv',
 'baltic_tags.csv',
 'chamoney1_tags.csv',
 'easterneuropean_tags.csv',
 'easterneuropeancheck_tags.csv',
 'helvetica12_tags.csv',
 'katteryyna_tags.csv',
 'katteryyna_tags.csv',
 'miadio_tags.csv',
 'monica_zielinski_tags.csv',
 'nikiproshin_tags.csv',
 'okaykali_tags.csv',
 'postsoviet_tags.csv',
 'SepVideos_tags.csv',
 'slavic_tags.csv',
 'straightouttarussia_tags.csv',
 'teameffujoe_tags.csv',
 'TellMeYouNotAmerican_tags.csv',
 'therussianmatreshka_tags.csv',
 'torryhermann_tags.csv',
 'ugneexo_tags.csv',
 'voidable_tags.csv',
 'webkinpoodel_tags.csv']

In [16]:
# Объединяем даные в единую таблицу

all_csv_df = pd.DataFrame ()
for i in all_csv:
    new_df = pd.read_csv (os.path.join (path, get_folders_name_tags (i), all_csv [all_csv.index(i)]))#.drop ('Unnamed: 0', axis = 1)
    all_csv_df = all_csv_df.append (new_df)
    all_csv_df.reset_index()
all_csv_df [:5]

Unnamed: 0,original_hashtag,relative_hashtag,source
0,americans,fyp,caption
1,americans,foryou,caption
2,americans,lmao,caption
3,americans,animals,caption
4,americans,russia,caption


In [None]:
# Сохраняем датафрейм как csv-файл

pd.DataFrame.to_csv (all_csv_df, 'all_tags.csv')

In [17]:
# Посмотреть наиболее частотные пары тэгов

all_csv_df.groupby(['original_hashtag','relative_hashtag']).size().sort_values(ascending=False) [:5]

original_hashtag  relative_hashtag
easterneuropean   easterneuropean     1811
                  fyp                 1368
slavic            slavic               898
americans         americans            819
fyp               foryou               738
dtype: int64

In [18]:
# Приводим все значения к нижнему регистру и добавляем частотность пар тэгов, удаляем те пары, где оба значения совпадают 

tags_for_cytoscape = pd.DataFrame({'count' : all_csv_df.groupby(['original_hashtag','relative_hashtag']).size()}).reset_index()
tags_for_cytoscape ['original_hashtag'] = tags_for_cytoscape ['original_hashtag'].str.lower()
tags_for_cytoscape ['relative_hashtag'] = tags_for_cytoscape ['relative_hashtag'].str.lower()
for i in tags_for_cytoscape.iterrows ():
    if i[1]['original_hashtag'] == i[1]['relative_hashtag']:
        tags_for_cytoscape.drop (i[0], inplace = True)
tags_for_cytoscape [:5]

Unnamed: 0,original_hashtag,relative_hashtag,count
0,00s,00svibes,1
1,00s,90s,1
2,00s,postsoviet,1
3,00s,vibes,1
4,00s,агатакристи,1


In [19]:
# Фильтруем датафрейм по частотности и сохраняем как csv-файл

tags_morethan_15 = tags_for_cytoscape [ tags_for_cytoscape['count'] > 15 ]
pd.DataFrame.to_csv (tags_morethan_15, 'tags_morethan_15.csv')
tags_morethan_15 [:5]

Unnamed: 0,original_hashtag,relative_hashtag,count
501,4u,funny,20
558,4u,russianvsamerican,16
582,4u,viral,31
1498,americans,fyp,19
2657,democrats,americans,16


In [20]:
# Смотрим на число уникальных тэгов

len(all_csv_df['relative_hashtag'].unique())

9456

In [None]:
# Сохраняем тэги как gexf-файл

import networkx as nx
tags_to_gexf = all_csv_df.drop ('source', axis = 1)
tags_to_gexf.columns = ['source', 'target']
gexf_graph = nx.from_pandas_edgelist (tags_to_gexf, create_using = nx.MultiGraph ())
nx.write_gexf (gexf_graph, 'all_tags_graph.gexf')
print ('Gexf-файл успешно сохранён')

# Анализ комментариев

In [21]:
# Посмотрим на отдельные файлы с комментариями, которые нам предстоит объединить в одну таблицу

def get_folders_name_comments (filename):
    x = filename.split ('_comments_table.csv')
    return x[0]

path = os.getcwd()
folders_subfiles = []
for i in folders_list_dir:
    folders_subfiles.append (os.listdir (os.path.join (path, i)))
all_csv = []
for i in folders_subfiles:
    for j in i:
        if j[-18:] == 'comments_table.csv':
            all_csv.append (j)
all_csv

['chamoney1_comments_table.csv',
 'helvetica12_comments_table.csv',
 'katteryyna_comments_table.csv',
 'monica_zielinski_comments_table.csv',
 'okaykali_comments_table.csv',
 'SepVideos_comments_table.csv',
 'teameffujoe_comments_table.csv',
 'TellMeYouNotAmerican_comments_table.csv',
 'ugneexo_comments_table.csv',
 'voidable_comments_table.csv',
 'voidable2_comments_table.csv',
 'webkinpoodel_comments_table.csv']

In [22]:
# Объединить все таблицы в единый датафрейм

all_csv_df = pd.DataFrame ()
for i in all_csv:
    try:
        new_df = pd.read_csv (os.path.join (path, get_folders_name_comments (i), all_csv [all_csv.index(i)]), sep=';').drop ('Unnamed: 0', axis = 1)
        all_csv_df = all_csv_df.append (new_df)
        all_csv_df.reset_index()
    except FileNotFoundError:
        pass
all_csv_df = all_csv_df.drop_duplicates()
all_csv_df [:5]

Unnamed: 0,aweme_id,cid,comment_type,create_time,date,digg_count,is_author_digged,label_list,label_text,label_type,...,reply_to_reply_id,status,stick_position,text,text_extra,user,user_buried,user_nickname,user_sec_uid,user_unique_id
0,6935766580089359621,6935892084100907009,,1614888230,04.03.2021,3263,False,,,,...,0.0,1.0,0.0,вопрос то в другом яблочный спас когда?,[],"{'nickname': 'фалафель', 'unique_id': 'felevel...",False,фалафель,MS4wLjABAAAAqbCfA702eLQZWdfmozv2kT1hOcuIjaVUYx...,felevel
1,6935766580089359621,6935899153248124929,,1614889877,04.03.2021,1543,False,,,,...,0.0,1.0,0.0,"КОГДА БЫЛО КРЕЩЕНИЕ РУСИ, А????",[],"{'nickname': 'Берели', 'unique_id': 'mybeleri'...",False,Берели,MS4wLjABAAAA-6yrXI7_nkgLRzwNkeRINqtskKxwEvnmtj...,mybeleri
2,6935766580089359621,6935873418386784257,,1614883876,04.03.2021,2068,False,,,,...,0.0,1.0,0.0,дата ледового побоища...,[],"{'nickname': 'ArinessA', 'unique_id': 'aderin_...",False,ArinessA,MS4wLjABAAAAT88x2sKkWDRlZxizL315wyvIxlaK1o3CqV...,aderin_arr
3,6935766580089359621,6935997455586869250,,1614912765,05.03.2021,1089,False,,,,...,0.0,1.0,0.0,"интересно чё с ним будет, когда узнает, что мы...",[],"{'nickname': 'хипхопотам', 'unique_id': 'hiph0...",False,хипхопотам,MS4wLjABAAAAdU9OtwQGEREb6DNtyNhMwWEBMIraacHxsX...,hiph0p0p0tamus
4,6935766580089359621,6936754794384752641,,1615089088,07.03.2021,1001,False,,,,...,0.0,1.0,0.0,"Почему то американцы считают, что все люди в м...",[],"{'nickname': 'diankane', 'unique_id': 'diankan...",False,diankane,MS4wLjABAAAAbgzqiTODtetF9BBXazySg3ec3GvF4VIPol...,diankane


In [23]:
# Объединяем все тексты комментариев в единый список

all_comments = []
for i in all_csv_df ['text']:
    try:
        all_comments.append (i.lower())
    except TypeError:
        pass
all_comments [:5]

['вопрос то в другом яблочный спас когда?',
 'когда было крещение руси, а????',
 'дата ледового побоища...',
 'интересно чё с ним будет, когда узнает, что мы в курсе и когда у них день благодарения',
 'почему то американцы считают, что все люди в мире должны знать их культуру и обычаи, когда они сами не знают о внешнем мире ничего... буквально.']

In [None]:
# Сохраняем все комментарии как csv-файл

with open ('all_comments.csv', 'w', newline = '', encoding = 'utf-8') as all_comments_file:
    writer = csv.writer (all_comments_file, delimiter = ",")
    for i in all_comments:
        print (i)
        writer.writerow ([i])

In [None]:
# Сохраняем все файлы как txt-файл

csv_file = 'all_comments.csv'
txt_file = 'all_comments.txt'
with open(txt_file, "w", encoding = 'utf-8') as my_output_file:
    with open(csv_file, "r", encoding = 'utf-8') as my_input_file:
        [ my_output_file.write(" ".join(row)+'\n') for row in csv.reader(my_input_file)]
    my_output_file.close()

In [24]:
# Загружаем файл и лемматизируем его, очищаем список лемм от стоп-слов

filename = 'all_comments.csv'

f = open (filename,'r',encoding='utf-8')
text = str (f.read())

lemmatize_nltk ('TRY_lemma_list.txt', text)

with open ('TRY_lemma_list.txt', 'r', encoding='utf-8') as f:
    lemmatext = str (f.read()).split ()
    
stop_words = get_stop_words ('english')
stop_words.extend (['don', 't', 'gon', 'na', 'm', 's', 'part', 'Part', 'I', 'im', 'You', 'you', 'YOU', 'u', 'i', 'It', 'it',
                    'cc', 'CC', 'Cc', 'v', 'stitch', "n't", 'go', 'can', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
                    'if', 'If', 'r', 'and', 'And', 'like', 'me', 'Me', 'of', 'Of', 'oh', 'hey', 'a', 'how', 'no', 'are',
                   'in', 'the', 'did', 'do', 'didn', 'ya', 're', 'he', 'or', 'to', 'be', 'is', 'as', 'that', 'tho', 'though',
                   'also', 'la', 've', 'em', 'ta', 'e', 'el', 'ah', 'didnt', 'dont'])

clean = []

for lemma in lemmatext:
    if lemma not in stop_words and not re.match('\W+', lemma):
        clean.append(lemma)
        
lemmatext = clean

pairs = []

for i in range (len(lemmatext)-1):
    if lemmatext[i] != lemmatext[i+1]:
        pair = min (lemmatext[i], lemmatext[i+1]) + '_' + max (lemmatext[i], lemmatext[i+1])
        pairs.append(pair.lower())
        
counter_set_pairs = Counter(pairs).most_common()
counter_set_pairs [:5]

[('american_ask', 177),
 ('america_south', 163),
 ('american_say', 137),
 ('american_think', 127),
 ('english_speak', 101)]

In [None]:
# Поиск по комментариям

for i in all_comments:
    if ('health' in i) & ('care' in i):
        print (i)

In [None]:
# Сохраняем результат как csv-файл для последующего преобразования в граф

csv = 'word1,word2,weight' + '\n'

for count in counter_set_pairs:
    if count[1] > 10:
        word1 = count[0].split('_')[0]
        word2 = count[0].split('_')[1]
        weight = str(count[1])
        csv += word1 + ',' + word2 + ',' + weight + '\n'
        
with open('CommentsPairs.csv', 'w', encoding = 'utf-8') as f:
    f.write(csv)
    
pd.read_csv ('CommentsPairs.csv', encoding = 'utf-8') [:5]

In [25]:
# Выделяем только пары, где упоминается этничность или региональная идентичность

ethnolist = ['russian', 'russia', 'usa', 'american', 'america', 'america', 'state', 'united', 
             'slav', 'slavic', 'baltic', 'easterneuropean', 'lithuania', 'lithuanian', 'polish', 'poland', 
             'ukranian', 'eastern', 'balkan', 'baltic', 'postsoviet', 'yugoslav', 'yugoslavia', 'yugoslavian',
             'estonian', 'estonia', 'romanian', 'romania', 'latvian', 'latvia', 'ukraine', 'ukranian',  
             'czechia', 'czech', 'slovenia', 'slovenian', 'bosnian', 'bosnia', 'bosniaandherzegovina', 'albania', 'albanian', 
            'montenegro', 'montenegrin', 'kosovo', 'kosovar', 'kosovan', 'serbia', 'serbian', 'serb', 
            'bulgaria', 'bulgarian', 'moldova', 'moldovian', 'belarus', 'belarusian', 'georgia', 'georgian', 
             'kazakh', 'kazakhstan', 'armenia', 'armenian']
pairs = []

for i in range (len(lemmatext)-1):
    if (lemmatext[i] in ethnolist) & (lemmatext[i] != lemmatext[i+1]):
        pair = min (lemmatext[i], lemmatext[i+1]) + '_' + max (lemmatext[i], lemmatext[i+1])
        pairs.append(pair.lower())

counter_set_pairs = Counter(pairs).most_common()
counter_set_pairs [:5]

[('american_ask', 152),
 ('american_say', 99),
 ('state_united', 95),
 ('american_think', 93),
 ('american_know', 60)]

Автор кода: Мария Казакова (@undine_su_menulio), marikasakowa@gmail.com 

Москва, 2021 год.