In [1]:
import pandas as pd
import pymorphy2 as pm
import numpy as np
import stanza as st
import ast

In [56]:
df = pd.read_csv('data/vk_posts.csv')

In [18]:
df.rename(columns = {'date':'date_unit'}, inplace = True)

In [20]:
df['date'] = pd.to_datetime(df['date_unit'], unit='s') + pd.Timedelta('08:00:00')

In [23]:
df['year'] = df['date'].dt.year

In [24]:
df['month'] = df['date'].dt.month

In [26]:
seasons = {1:'зима',2:'зима',3:"весна",4:"весна",5:"весна",6:"лето",7:"лето",8:"лето",9:"осень",10:"осень",11:"осень",12:'зима'}

In [116]:
df['seasons'] = df.month.apply(lambda x: seasons[x])

SyntaxError: invalid syntax (<ipython-input-116-0debc2416d32>, line 1)

In [190]:
df.to_csv('data/posts_vk.csv', index = False)

In [130]:
df.loc[~df.attachments.isna(), 'items_attachments'] = df.loc[~df.attachments.isna()].attachments.apply(lambda x: [el['type'] for el in ast.literal_eval(x) ])

In [134]:
df.loc[~df.attachments.isna(), 'count_photo'] = df.loc[~df.attachments.isna()].items_attachments.apply(lambda x: x.count('photo'))
df.loc[~df.attachments.isna(), 'count_video'] = df.loc[~df.attachments.isna()].items_attachments.apply(lambda x: x.count('video'))
df.loc[~df.attachments.isna(), 'count_audio'] = df.loc[~df.attachments.isna()].items_attachments.apply(lambda x: x.count('audio'))

In [129]:
[el['type'] for el in  ast.literal_eval(df.loc[df.attachments.str.contains('video', na = False)].loc[27, 'attachments'])]

['photo', 'photo', 'photo', 'photo', 'photo', 'photo', 'photo', 'video']

In [136]:
df.columns

Index(['id', 'from_id', 'owner_id', 'date_unit', 'marked_as_ads', 'post_type',
       'text', 'signer_id', 'attachments', 'short_text_rate',
       'carousel_offset', 'hash', 'post_source.platform', 'post_source.type',
       'comments.can_post', 'comments.count', 'likes.can_like', 'likes.count',
       'likes.user_likes', 'likes.can_publish', 'reposts.count',
       'reposts.user_reposted', 'views.count', 'donut.is_donut', 'edited',
       'copy_history', 'copyright.id', 'copyright.link', 'copyright.type',
       'copyright.name', 'post_source.data', 'zoom_text', 'geo.type',
       'geo.coordinates', 'geo.place.created', 'geo.place.id',
       'geo.place.is_deleted', 'geo.place.latitude', 'geo.place.longitude',
       'geo.place.title', 'geo.place.total_checkins', 'geo.place.updated',
       'geo.place.category', 'geo.place.country',
       'geo.place.category_object.id', 'geo.place.category_object.title',
       'geo.place.category_object.icons', 'geo.place.city', 'date', 'year',
   

# text processing

In [161]:
words = df.text.str.lower().str.extractall(r'([a-zа-яё\d\-]+)').reset_index('match').merge(df[['id']], how = 'left', left_index = True, right_index = True)

In [162]:
words['length'] = words[0].str.len()

In [163]:
words.rename(columns = {'match':'number_word', 0:'word'}, inplace = True)

In [44]:
morph =  pm.MorphAnalyzer()

In [164]:
result = []

In [165]:
words.word.drop_duplicates().apply(lambda x: result.append({'normal_form':morph.parse(x)[0].normal_form, \
                                                                'tag':morph.parse(x)[0].tag,
                                                                'grammema':morph.parse(x)[0].tag.POS,
                                                                'word':x}) )

0       None
0       None
1       None
1       None
1       None
        ... 
6403    None
6403    None
6403    None
6403    None
6448    None
Name: word, Length: 12710, dtype: object

In [166]:
words = words.merge(pd.DataFrame(result), how = 'left', on = 'word')

In [167]:
words.loc[words.length>2].drop(columns = ['id', 'number_word', 'word']).value_counts().reset_index().to_csv('data/words_counts.csv', index = False)

In [82]:
words.loc[words.length>2].to_csv('data/all_words.csv', index = False)

In [185]:
words = words.merge(df[['id','date', 'year', 'month', 'seasons', 'geo.place.latitude', 'geo.place.longitude', 'geo.place.category',\
               'geo.place.title', 'count_photo', 'count_video', 'count_audio','comments.count', 'reposts.count', 'likes.count']],\
           how = 'left', on = 'id')
# .to_csv('data/all_words.csv', index = False)

# NER processing

In [85]:
st.download('ru')

HBox(children=(FloatProgress(value=0.0, description='Downloading https://raw.githubusercontent.com/stanfordnlp…

2022-04-19 13:37:03 INFO: Downloading default packages for language: ru (Russian)...





2022-04-19 13:37:05 INFO: File exists: C:\Users\yupes\stanza_resources\ru\default.zip.
2022-04-19 13:37:14 INFO: Finished downloading models and saved to C:\Users\yupes\stanza_resources.


In [86]:
nlp = st.Pipeline(lang='ru', processors='tokenize,ner,pos,lemma,depparse')

2022-04-19 13:37:14 INFO: Loading these models for language: ru (Russian):
| Processor | Package   |
-------------------------
| tokenize  | syntagrus |
| pos       | syntagrus |
| lemma     | syntagrus |
| depparse  | syntagrus |
| ner       | wikiner   |

2022-04-19 13:37:14 INFO: Use device: cpu
2022-04-19 13:37:14 INFO: Loading: tokenize
2022-04-19 13:37:14 INFO: Loading: pos
2022-04-19 13:37:15 INFO: Loading: lemma
2022-04-19 13:37:15 INFO: Loading: depparse
2022-04-19 13:37:15 INFO: Loading: ner
2022-04-19 13:37:17 INFO: Done loading processors!


In [100]:
def get_loc(elements):
    text, ind = elements
    try:
        doc = nlp(text)
        for sent in doc.sentences:
            for ent in sent.ents:
                if ent.type=='LOC':
                    locs.append({'ner':ent.text, 'index_review':ind})
    except:
        pass

In [101]:
locs = []

In [102]:
df.loc[~df.text.isna()][['text', 'id']].apply(lambda x: get_loc(x), axis = 1)

0       None
1       None
2       None
4       None
5       None
        ... 
6447    None
6448    None
6449    None
6450    None
6451    None
Length: 2009, dtype: object

In [141]:
df_ner = pd.DataFrame(locs).merge(df[['id','date', 'year', 'month', 'seasons', 'geo.place.latitude', 'geo.place.longitude', 'geo.place.category',\
               'geo.place.title', 'count_photo', 'count_video', 'count_audio','comments.count', 'reposts.count', 'likes.count']]\
                                  .rename(columns = {'id':'index_review'}), how = 'left', on = 'index_review')

In [251]:
pd.DataFrame(locs).ner.value_counts().reset_index()

Unnamed: 0,index,ner
0,Байкал,322
1,Байкала,183
2,Байкале,141
3,Ольхон,97
4,Ангара,48
...,...,...
1128,Байкальский котяра,1
1129,Гроты Байкала,1
1130,"КАМЕННАЯ ПАЛИТРА""",1
1131,Малая Коса,1


In [252]:
pd.DataFrame(locs).to_csv('data/ner.csv', index = False)

In [253]:
df_ner = pd.read_csv('data/ner.csv')

In [254]:
words_temp = words.merge(df_ner.rename(columns = {'index_review':'id'})[['id', 'ner']], how = 'left', on = 'id')

In [255]:
words_temp['is_toponim'] = False

In [256]:
words_temp.loc[~words_temp.ner.isna(), 'is_toponim'] = words_temp.loc[~words_temp.ner.isna()][['word', 'ner']].apply(lambda x: ' '+x[0]+' ' in (' '+x[1]+' ').lower(), axis = 1)

In [257]:
words_temp.normal_form.replace('кбжда', "кбжд", inplace = True)
words_temp.normal_form.replace('ангар', "ангара", inplace = True)

In [258]:
words_temp.loc[(words_temp.tag.astype(str).str.contains('Geox sing', na = False)) & \
                  (words_temp.is_toponim==False), 'ner'] = words_temp.loc[(words_temp.tag.astype(str).str.contains('Geox sing', na = False)) & \
                  (words_temp.is_toponim==False), 'word'].str.capitalize()

In [259]:
words_temp.loc[(words_temp.tag.astype(str).str.contains('Geox sing', na = False)) & \
                  (words_temp.is_toponim==False), 'is_toponim'] = True

In [263]:
words_temp['key'] = words_temp['id'].astype(str)+'_'+words_temp['number_word'].astype(str)

In [266]:
words_temp.drop_duplicates(['key', 'ner']).ner.value_counts().reset_index().to_csv('data/ner_for_change.csv', index = False)

In [192]:
words_temp.loc[words_temp.is_toponim== True]

Unnamed: 0,number_word,word,id,length,normal_form,tag,grammema,date,year,month,...,geo.place.category,geo.place.title,count_photo,count_video,count_audio,comments.count,reposts.count,likes.count,ner,is_toponim
0,0,март,43068,4,март,"NOUN,inan,masc sing,nomn",NOUN,2022-04-11 12:45:50,2022,4,...,,,3.0,0.0,0.0,6,16,360,Март,True
72,70,россии,43048,6,россия,"NOUN,inan,femn,Sgtm,Geox sing,gent",NOUN,2022-04-10 21:18:05,2022,4,...,,,1.0,0.0,0.0,3,3,19,России,True
91,0,байкал,43031,6,байкал,"NOUN,inan,masc,Geox sing,nomn",NOUN,2022-04-10 12:06:22,2022,4,...,,,9.0,0.0,0.0,23,127,2557,Байкал,True
92,0,сарминское,42982,10,сарминский,"ADJF neut,sing,nomn",ADJF,2022-04-08 21:16:18,2022,4,...,,,3.0,0.0,0.0,8,10,457,Сарминское ущелье,True
95,1,ущелье,42982,6,ущелие,"NOUN,inan,neut sing,nomn",NOUN,2022-04-08 21:16:18,2022,4,...,,,3.0,0.0,0.0,8,10,457,Сарминское ущелье,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
544549,0,лик,2106,3,лик,"NOUN,inan,masc sing,accs",NOUN,2017-10-15 21:18:43,2017,10,...,,,1.0,0.0,0.0,24,50,1813,Лик Байкала,True
544550,1,байкала,2106,7,байкал,"NOUN,inan,masc,Geox sing,gent",NOUN,2017-10-15 21:18:43,2017,10,...,,,1.0,0.0,0.0,24,50,1813,Лик Байкала,True
544551,0,иркутск,2098,7,иркутск,"NOUN,inan,masc,Geox sing,nomn",NOUN,2017-10-15 20:19:00,2017,10,...,,,1.0,0.0,0.0,40,31,1622,Иркутск,True
544552,0,тёплые,2090,6,тёплый,"ADJF,Qual plur,nomn",ADJF,2017-10-15 17:19:00,2017,10,...,,,1.0,0.0,0.0,12,47,1449,Тёплые озёра,True


In [237]:
list(words_temp.ner.unique())

['Март',
 'России',
 'Байкал',
 'Сарминское ущелье',
 'Сарма',
 'Оз.Байкал',
 'Северобайкальск',
 'Листвянка Байкал',
 'Иркут',
 'ББТ',
 nan,
 'Ангары',
 'Лёд Байкала',
 'Байкала',
 'Ольхона',
 'Ольхон',
 'Бухта Песчаная',
 'Листвянкой',
 'Большими Котами',
 'Кынгарге',
 'Аршан',
 'Хамар-Дабан',
 'Мамай',
 'р. Бабха',
 'Космический лёд Байкала',
 'Большом Голоустном',
 'Сибирские просторы',
 'Транссибе',
 'Слюдянка',
 'Андриановская',
 'Ангасольская',
 'Медлянская петли',
 'Транссиба',
 'Иркутска',
 'Слюдянки',
 'Нижний Прибайкальский тоннель',
 'Верхний Прибайкальский тоннель',
 'Новогодний Байкал',
 'Сахюрта',
 'Октябрь',
 'Байкальской Горной Страны',
 'Декабрь',
 'Большое Голоустное',
 'Листвянка',
 'Большая Байкальская Тропа',
 'Кадилинские пещеры',
 'пещеру Часовня',
 'Сарайском пляже',
 'Слюдяные штольни - Село Байкальское"',
 'Красный Яр',
 'Иркутском',
 'Малого моря',
 'Восточный Саян',
 'Баргузинская долина',
 'Еланцов',
 'Сахюрты',
 'Шебета',
 'Анга',
 'Курыканской стены',
 '

In [313]:
words_temp_v2 = words_temp.copy()
words_temp_v2.loc[words_temp_v2.is_toponim== False, 'ner'] = np.nan
words_temp_v2.loc[words_temp_v2.is_toponim== False, 'ner_norm'] = np.nan


In [314]:
words_temp_v2 = words_temp_v2.loc[~words_temp_v2.index.isin(words_temp_v2.loc[(words_temp_v2.key.isin(words_temp_v2.loc[words_temp_v2.is_toponim == True, 'key']) ) &\
               (words_temp_v2.is_toponim==False)].index)]

In [211]:
words_temp.loc[words_temp['id']==42919]

Unnamed: 0,number_word,word,id,length,normal_form,tag,grammema,date,year,month,...,geo.place.title,count_photo,count_video,count_audio,comments.count,reposts.count,likes.count,ner,is_toponim,key
146,0,оз,42919,2,оз,"NOUN,inan,masc sing,nomn",NOUN,2022-04-05 15:56:01,2022,4,...,,6.0,0.0,0.0,20,49,1654,,False,42919_0
147,0,оз,42919,2,оз,"NOUN,inan,masc sing,nomn",NOUN,2022-04-05 15:56:01,2022,4,...,,6.0,0.0,0.0,20,49,1654,,False,42919_0
148,1,байкал,42919,6,байкал,"NOUN,inan,masc,Geox sing,nomn",NOUN,2022-04-05 15:56:01,2022,4,...,,6.0,0.0,0.0,20,49,1654,,False,42919_1
149,1,байкал,42919,6,байкал,"NOUN,inan,masc,Geox sing,nomn",NOUN,2022-04-05 15:56:01,2022,4,...,,6.0,0.0,0.0,20,49,1654,,False,42919_1
150,2,г,42919,1,г,"NOUN,inan,masc,Fixd,Abbr sing,gent",NOUN,2022-04-05 15:56:01,2022,4,...,,6.0,0.0,0.0,20,49,1654,,False,42919_2
151,2,г,42919,1,г,"NOUN,inan,masc,Fixd,Abbr sing,gent",NOUN,2022-04-05 15:56:01,2022,4,...,,6.0,0.0,0.0,20,49,1654,,False,42919_2
152,3,северобайкальск,42919,15,северобайкальск,"NOUN,inan,masc,Geox sing,nomn",NOUN,2022-04-05 15:56:01,2022,4,...,,6.0,0.0,0.0,20,49,1654,,False,42919_3
153,3,северобайкальск,42919,15,северобайкальск,"NOUN,inan,masc,Geox sing,nomn",NOUN,2022-04-05 15:56:01,2022,4,...,,6.0,0.0,0.0,20,49,1654,Северобайкальск,True,42919_3


In [219]:
words_temp_v2.loc[(words_temp_v2.tag.astype(str).str.contains('Geox sing', na = False)) & \
                  (words_temp_v2.is_toponim==False), 'word']

148         байкал
149         байкал
4127        байкал
4593        дугане
4594        дугане
            ...   
534189      байкал
534447    изюбриха
534451      байкал
534720       шикша
544555      ольхон
Name: word, Length: 775, dtype: object

In [306]:
words_temp.loc[words_temp.ner_norm.isna(), 'is_toponim'] = np.nan

In [315]:
words_temp_v2.drop_duplicates(['id', 'number_word', 'is_toponim', 'ner', 'ner_norm']).to_csv('data/all_words.csv', index = False)

In [309]:
words_temp.to_csv('data/link_words_with_toponims.csv', index = False)

In [245]:
words_temp['ner_v2'] = words_temp.loc[~words_temp.ner.isna()].ner.apply(lambda x: morph.normal_forms(x)[0])

In [291]:
df.loc[df.text.str.contains('Даурские', na= False)].text.tolist()

['Даурские галки']

In [38]:
df_ner = pd.read_excel('data/new_ner.xlsx')

In [2]:
df = pd.read_csv('data/posts_vk.csv')

In [300]:
words_temp = words_temp.merge(df_ner[['ner', 'ner_norm']], how  = 'left', on = 'ner')

In [1]:
words_temp.columns

NameError: name 'words_temp' is not defined

# схожесть текстов

In [2]:
import difflib

In [4]:
difflib.SequenceMatcher(None, 'мыс шаманка', 'шаманка').ratio()

0.7777777777777778

In [27]:
matrix = pd.DataFrame(index = df_ner[['ner_norm']].drop_duplicates().dropna().ner_norm, columns = df_ner[['ner_norm']].drop_duplicates().dropna().ner_norm.tolist())

In [28]:
for c in matrix.columns:
    for i in matrix.index:
        if i!=c:
            matrix.loc[i, c] = difflib.SequenceMatcher(None, i, c).ratio()

In [37]:
matrix.fillna(0).idxmax().reset_index().to_csv('data/исправление ошибок.csv', index = False)

In [39]:
words = pd.read_csv('data/all_words.csv')
words2 = pd.read_csv('data/link_words_with_toponims.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


In [52]:
words2.is_toponim.fillna(False, inplace = True)

In [53]:
for ner, ner_norm in zip(df_ner.ner, df_ner.ner_norm):
    words2.loc[words2.ner==ner, 'ner_norm'] = ner_norm

In [54]:
words2['ner_norm'] = words2.ner_norm.str[0].str.upper()+words2.ner_norm.str[1:]

In [51]:
words.to_csv('data/all_words.csv', index = False)

In [55]:
words2.to_csv('data/link_words_with_toponims.csv', index = False)

In [63]:
words.loc[words['id']==42636][['number_word', 'word', 'id', 'length', 'normal_form', 'ner', 'ner_norm', 'is_toponim']]

Unnamed: 0,number_word,word,id,length,normal_form,ner,ner_norm,is_toponim
263,0,байкал,42636,6,байкал,Байкал,Байкал,True
264,1,остров,42636,6,остров,,,False
265,2,ольхон,42636,6,ольхон,Ольхон,Ольхон,True
266,3,февраль,42636,7,февраль,,,False
267,4,2022,42636,4,2022,,,False


In [65]:
words2.drop_duplicates().loc[words2['id']==42636][['number_word', 'word', 'id', 'length', 'normal_form', 'ner', 'ner_norm', 'is_toponim']]

Unnamed: 0,number_word,word,id,length,normal_form,ner,ner_norm,is_toponim
464,0,байкал,42636,6,байкал,Байкал,Байкал,True
466,1,остров,42636,6,остров,Байкал,Байкал,False
467,1,остров,42636,6,остров,Ольхон,Ольхон,False
468,2,ольхон,42636,6,ольхон,Ольхон,Ольхон,True
470,3,февраль,42636,7,февраль,Байкал,Байкал,False
471,3,февраль,42636,7,февраль,Ольхон,Ольхон,False
472,4,2022,42636,4,2022,Байкал,Байкал,False
473,4,2022,42636,4,2022,Ольхон,Ольхон,False


In [7]:
pd.concat([df.drop_duplicates(['text', 'signer_id']), df.loc[df.text.isna()]], ignore_index = True).drop_duplicates('id')

Unnamed: 0,id,from_id,owner_id,date_unit,marked_as_ads,post_type,text,signer_id,attachments,short_text_rate,...,geo.place.category_object.icons,geo.place.city,date,year,month,seasons,count_photo,items_attachments,count_video,count_audio
0,43068,-95467299,-95467299,1649652350,0,post,"Март, 2022г.",37137538.0,"[{'type': 'photo', 'photo': {'album_id': -7, '...",0.8,...,,,2022-04-11 12:45:50,2022,4,весна,3.0,"['photo', 'photo', 'photo']",0.0,0.0
1,43048,-95467299,-95467299,1649596685,0,post,"ВНИМАНИЕ, РОЗЫГРЫШ❗🎁🎉\nВ честь открытия нашей ...",192738006.0,"[{'type': 'photo', 'photo': {'album_id': -7, '...",0.8,...,,,2022-04-10 21:18:05,2022,4,весна,1.0,['photo'],0.0,0.0
2,43031,-95467299,-95467299,1649563582,0,post,Байкал 💙💙💙,632499857.0,"[{'type': 'photo', 'photo': {'album_id': -7, '...",0.8,...,,,2022-04-10 12:06:22,2022,4,весна,9.0,"['photo', 'photo', 'photo', 'photo', 'photo', ...",0.0,0.0
3,43006,-95467299,-95467299,1649473761,0,post,,561607471.0,"[{'type': 'photo', 'photo': {'album_id': -7, '...",0.8,...,,,2022-04-09 11:09:21,2022,4,весна,7.0,"['photo', 'photo', 'photo', 'photo', 'photo', ...",0.0,0.0
4,42982,-95467299,-95467299,1649423778,0,post,"Сарминское ущелье и река Сарма , зимой так же ...",463663055.0,"[{'type': 'photo', 'photo': {'album_id': -7, '...",0.8,...,,,2022-04-08 21:16:18,2022,4,весна,3.0,"['photo', 'photo', 'photo']",0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6238,2077,-95467299,-95467299,1508041168,0,post,,,"[{'type': 'video', 'video': {'access_key': '24...",0.8,...,,,2017-10-15 12:19:28,2017,10,осень,0.0,['video'],1.0,0.0
6239,2054,-95467299,-95467299,1507985833,0,post,,,"[{'type': 'photo', 'photo': {'album_id': -7, '...",0.8,...,,,2017-10-14 20:57:13,2017,10,осень,1.0,['photo'],0.0,0.0
6240,2053,-95467299,-95467299,1507979515,0,post,,,"[{'type': 'photo', 'photo': {'album_id': -7, '...",0.8,...,,,2017-10-14 19:11:55,2017,10,осень,1.0,['photo'],0.0,0.0
6241,2051,-95467299,-95467299,1507976640,0,post,,,"[{'type': 'photo', 'photo': {'album_id': -7, '...",0.8,...,,,2017-10-14 18:24:00,2017,10,осень,1.0,['photo'],0.0,0.0


In [11]:
df.loc[df.text.isna()].loc[3]

id                                                                             43006
from_id                                                                    -95467299
owner_id                                                                   -95467299
date_unit                                                                 1649473761
marked_as_ads                                                                      0
post_type                                                                       post
text                                                                             NaN
signer_id                                                                561607471.0
attachments                        [{'type': 'photo', 'photo': {'album_id': -7, '...
short_text_rate                                                                  0.8
carousel_offset                                                                  0.0
hash                                                             