In [1]:
import pandas as pd

In [16]:
df = pd.read_csv('../../data/feed_actions_last_10.csv', index_col=None, sep=';')

In [17]:
df_feed_actions = df.drop('row_number', axis=1)

In [18]:
# Превратим колонку в pd.datetime
df_feed_actions['time'] = pd.to_datetime(df_feed_actions['time'])

In [19]:
df_feed_actions['month'] = df_feed_actions['time'].dt.month
df_feed_actions['dayofweek'] = df_feed_actions['time'].dt.dayofweek
df_feed_actions['hour'] = df_feed_actions['time'].dt.hour

In [22]:
df_users = pd.read_csv('../../data/users.csv', index_col=None, sep=';')
df_users = df_users.rename(columns={'id': 'user_id'})

In [23]:
df_posts = pd.read_csv('../../data/posts.csv', index_col=None, sep=';')
df_posts = df_posts.rename(columns={'id': 'post_id'})

Объединим таблицы feed_actions, users и posts

In [24]:
df_merged_1 = df_users.merge(df_feed_actions, on='user_id')

In [26]:
df_merged_2 = df_merged_1.merge(df_posts, on='post_id', how='left')

Преобразуем категориальные колонки в числовые

In [27]:
df_2 = df_merged_2

In [29]:
df_2['os'] = df_2['os'].astype('category').cat.codes
df_2['source'] = df_2['source'].astype('category').cat.codes
df_2['city'] = df_2['city'].astype('category').cat.codes

Проверим данные на пропуски

In [30]:
if df_2.isna().any().any():
    print('есть пропуски')
else:
    print('пропусков нет')

пропусков нет


Преобразуем колонки 'topic' и 'country' в числовые, причем по популярности

In [31]:
# Вычисление частотности каждой категории
topic_frequency_map = df_2['topic'].value_counts().to_dict()

topic_frequency_map

{'movie': 1571980,
 'covid': 56752,
 'tech': 1613,
 'sport': 990,
 'politics': 372,
 'entertainment': 154,
 'business': 73}

In [32]:
for top_freq, topic in enumerate(topic_frequency_map.keys()):
    topic_frequency_map[topic] = top_freq

Преобразование категориальной колонки 'topic' в числовую по частотности


In [35]:
df_2['topic_num'] = df_2['topic'].map(topic_frequency_map)

In [36]:
# Вычисление частотности по странам
country_frequency_map = df_2['country'].value_counts().to_dict()

for top_freq, country in enumerate(country_frequency_map.keys()):
    country_frequency_map[country] = top_freq

In [38]:
df_2['country_num'] = df_2['country'].map(country_frequency_map)

In [39]:
df_2['country'] = df_2['country_num']
df_2['topic'] = df_2['topic_num']

df_2.drop(['country_num','topic_num'], axis=1, inplace=True)

Преобразуем таргетную колонку 'action' в числовую: view -> 0, like -> 1 (успех)

In [41]:
df_2['action'].unique()

array(['view', 'like'], dtype=object)

In [42]:
df_2['action'] = df_2['action'].replace(['view', 'like'], [0, 1])

  df_2['action'] = df_2['action'].replace(['view', 'like'], [0, 1])


Стемминг текста

In [44]:
from nltk.stem import PorterStemmer

# Создание экземпляра стеммера Портера из NLTK
stemmer = PorterStemmer()

# Определение функции для стемминга текста
def stem_text(text):
    return ' '.join([stemmer.stem(word) for word in text.split()])

In [45]:
df_text = df_2['text']

df_stemmed_text = df_text.apply(stem_text)

In [49]:
df_text_len = df_2['text'].map(len)
df_2['text_len'] = df_text_len

In [53]:
df_2['text'] = df_stemmed_text

In [54]:
df_2.to_csv('../../data/data_preprocessed.csv', index=False, sep=';')

In [55]:
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer

def preprocessing(line):
    line = line.lower()
    line = re.sub(r"[{}]".format(string.punctuation), " ", line)
    line = line.replace('\n\n', ' ').replace('\n', ' ')
    return line

vectorizer = TfidfVectorizer(
    max_df=0.9, 
    min_df=0.003, 
    stop_words='english',
    preprocessor=preprocessing
)
tfidf_matrix = vectorizer.fit_transform(df_2['text'])

In [56]:
tfidf_matrix

<1631934x4244 sparse matrix of type '<class 'numpy.float64'>'
	with 118255790 stored elements in Compressed Sparse Row format>

In [57]:
df_tfidf = df_2.copy()
df_tfidf['tfidf_sum'] = pd.DataFrame(tfidf_matrix.todense()).sum(axis=1)
df_tfidf['tfidf_mean'] = pd.DataFrame(tfidf_matrix.todense()).mean(axis=1)
df_tfidf['tfidf_max'] = pd.DataFrame(tfidf_matrix.todense()).max(axis=1)
df_tfidf

Unnamed: 0,user_id,gender,age,country,city,exp_group,os,source,post_id,action,time,month,dayofweek,hour,text,topic,text_len,tfidf_sum,tfidf_mean,tfidf_max
0,200,1,34,0,651,3,0,0,5181,0,2022-01-02 09:44:32,1,6,9,i saw thi film when it wa origin releas in 198...,0,669,6.315479,0.001488,0.375393
1,200,1,34,0,651,3,0,0,5228,0,2022-01-02 09:44:18,1,6,9,alien wa excellent. mani writer tri to copi it...,0,663,5.638878,0.001329,0.491565
2,200,1,34,0,651,3,0,0,5319,0,2022-01-02 09:43:55,1,6,9,"thi amusing, sometim poignant look at the holl...",0,2399,10.482928,0.002470,0.262795
3,200,1,34,0,651,3,0,0,5379,0,2022-01-02 09:43:47,1,6,9,"im start to think that there a conspiracy, all...",0,2813,11.189221,0.002636,0.194437
4,200,1,34,0,651,3,0,0,5361,0,2022-01-02 09:43:02,1,6,9,"more wide-eyed, hyster 50 hyper-ch that give n...",0,4091,12.599472,0.002969,0.266056
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1631929,168552,1,16,0,1061,4,0,1,7094,1,2022-01-24 18:53:16,1,0,18,from the open scene of fierc peopl (an interpl...,0,2876,11.687211,0.002754,0.320822
1631930,168552,1,16,0,1061,4,0,1,6846,1,2022-01-24 18:52:46,1,0,18,i have never seen a barbara steel movi that i ...,0,1418,9.575619,0.002256,0.269894
1631931,168552,1,16,0,1061,4,0,1,7097,1,2022-01-24 18:51:53,1,0,18,origin claymat rudolph: pretti good. origin fr...,0,876,6.197824,0.001460,0.477146
1631932,168552,1,16,0,1061,4,0,1,7094,0,2022-01-24 18:51:46,1,0,18,from the open scene of fierc peopl (an interpl...,0,2876,11.687211,0.002754,0.320822


In [58]:
df_tfidf.drop('text', axis=1, inplace=True)

Unnamed: 0,user_id,gender,age,country,city,exp_group,os,source,post_id,action,time,month,dayofweek,hour,topic,text_len,tfidf_sum,tfidf_mean,tfidf_max
0,200,1,34,0,651,3,0,0,5181,0,2022-01-02 09:44:32,1,6,9,0,669,6.315479,0.001488,0.375393
1,200,1,34,0,651,3,0,0,5228,0,2022-01-02 09:44:18,1,6,9,0,663,5.638878,0.001329,0.491565
2,200,1,34,0,651,3,0,0,5319,0,2022-01-02 09:43:55,1,6,9,0,2399,10.482928,0.002470,0.262795
3,200,1,34,0,651,3,0,0,5379,0,2022-01-02 09:43:47,1,6,9,0,2813,11.189221,0.002636,0.194437
4,200,1,34,0,651,3,0,0,5361,0,2022-01-02 09:43:02,1,6,9,0,4091,12.599472,0.002969,0.266056
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1631929,168552,1,16,0,1061,4,0,1,7094,1,2022-01-24 18:53:16,1,0,18,0,2876,11.687211,0.002754,0.320822
1631930,168552,1,16,0,1061,4,0,1,6846,1,2022-01-24 18:52:46,1,0,18,0,1418,9.575619,0.002256,0.269894
1631931,168552,1,16,0,1061,4,0,1,7097,1,2022-01-24 18:51:53,1,0,18,0,876,6.197824,0.001460,0.477146
1631932,168552,1,16,0,1061,4,0,1,7094,0,2022-01-24 18:51:46,1,0,18,0,2876,11.687211,0.002754,0.320822


In [59]:
df_tfidf.to_csv('../../data/data_preprocessed_tfidf.csv', index=False, sep=';')

In [60]:
df_users_likes = df_tfidf.groupby('user_id', as_index=False).agg({'action': 'sum'})
df_users_likes.rename(columns={'action': 'user_likes'}, inplace=True)

Unnamed: 0,user_id,user_likes
0,200,3
1,201,2
2,202,4
3,203,2
4,204,3
...,...,...
163200,168548,1
163201,168549,0
163202,168550,0
163203,168551,1


In [108]:
df_posts_liked = df_tfidf.groupby('post_id', as_index=False).agg({'action': 'sum'})
df_posts_liked.rename(columns={'action': 'post_likes'}, inplace=True)

In [63]:
df_tfidf_grouped = df_tfidf[df_tfidf['action'] == 1].groupby(['user_id', 'topic'], as_index=False).agg({'action': 'count'})
df_tfidf_grouped

Unnamed: 0,user_id,topic,action
0,200,0,3
1,201,0,2
2,202,1,4
3,203,0,2
4,204,0,3
...,...,...,...
146585,168546,0,2
146586,168547,0,2
146587,168548,0,1
146588,168551,0,1


In [64]:
max_indexes = df_tfidf_grouped.groupby('user_id')['action'].idxmax()
df_favourite_topic = df_tfidf_grouped.loc[max_indexes][['user_id', 'topic']]
df_favourite_topic.rename(columns={'topic': 'topic_favourite'}, inplace=True)
df_favourite_topic

Unnamed: 0,user_id,topic_favourite
0,200,0
1,201,0
2,202,1
3,203,0
4,204,0
...,...,...
146585,168546,0
146586,168547,0
146587,168548,0
146588,168551,0


In [65]:
df_new = df_tfidf.merge(df_users_likes, on='user_id', how='left')

In [66]:
df_new = df_new.merge(df_posts_liked, on='post_id', how='left')

In [70]:
df_new = df_new.merge(df_favourite_topic, on='user_id', how='left')

Unnamed: 0,user_id,gender,age,country,city,exp_group,os,source,post_id,action,...,dayofweek,hour,topic,text_len,tfidf_sum,tfidf_mean,tfidf_max,user_likes,post_likes,topic_favourite
0,200,1,34,0,651,3,0,0,5181,0,...,6,9,0,669,6.315479,0.001488,0.375393,3,107,0.0
1,200,1,34,0,651,3,0,0,5228,0,...,6,9,0,663,5.638878,0.001329,0.491565,3,43,0.0
2,200,1,34,0,651,3,0,0,5319,0,...,6,9,0,2399,10.482928,0.002470,0.262795,3,236,0.0
3,200,1,34,0,651,3,0,0,5379,0,...,6,9,0,2813,11.189221,0.002636,0.194437,3,11,0.0
4,200,1,34,0,651,3,0,0,5361,0,...,6,9,0,4091,12.599472,0.002969,0.266056,3,5,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1631929,168552,1,16,0,1061,4,0,1,7094,1,...,0,18,0,2876,11.687211,0.002754,0.320822,3,630,0.0
1631930,168552,1,16,0,1061,4,0,1,6846,1,...,0,18,0,1418,9.575619,0.002256,0.269894,3,2380,0.0
1631931,168552,1,16,0,1061,4,0,1,7097,1,...,0,18,0,876,6.197824,0.001460,0.477146,3,1964,0.0
1631932,168552,1,16,0,1061,4,0,1,7094,0,...,0,18,0,2876,11.687211,0.002754,0.320822,3,630,0.0


In [71]:
df_new['topic_favourite'] = df_new['topic_favourite'].fillna(0)

In [72]:
df_new['topic_favourite'] = df_new['topic_favourite'].astype(int)

Unnamed: 0,user_id,gender,age,country,city,exp_group,os,source,post_id,action,...,dayofweek,hour,topic,text_len,tfidf_sum,tfidf_mean,tfidf_max,user_likes,post_likes,topic_favourite
0,200,1,34,0,651,3,0,0,5181,0,...,6,9,0,669,6.315479,0.001488,0.375393,3,107,0
1,200,1,34,0,651,3,0,0,5228,0,...,6,9,0,663,5.638878,0.001329,0.491565,3,43,0
2,200,1,34,0,651,3,0,0,5319,0,...,6,9,0,2399,10.482928,0.002470,0.262795,3,236,0
3,200,1,34,0,651,3,0,0,5379,0,...,6,9,0,2813,11.189221,0.002636,0.194437,3,11,0
4,200,1,34,0,651,3,0,0,5361,0,...,6,9,0,4091,12.599472,0.002969,0.266056,3,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1631929,168552,1,16,0,1061,4,0,1,7094,1,...,0,18,0,2876,11.687211,0.002754,0.320822,3,630,0
1631930,168552,1,16,0,1061,4,0,1,6846,1,...,0,18,0,1418,9.575619,0.002256,0.269894,3,2380,0
1631931,168552,1,16,0,1061,4,0,1,7097,1,...,0,18,0,876,6.197824,0.001460,0.477146,3,1964,0
1631932,168552,1,16,0,1061,4,0,1,7094,0,...,0,18,0,2876,11.687211,0.002754,0.320822,3,630,0


In [None]:
df_new.to_csv('../../data/data_preprocessed.csv', index=False, sep=';')

In [145]:
users_features = ['user_id', 'gender', 'age', 'country', 'city', 'exp_group', 'os', 'source', 'user_likes', 'topic_favourite']

df_users_features = df_new[users_features].drop_duplicates()
df_users_features.to_csv('../../data/data_users_features.csv', index=False, sep=';')

In [183]:
posts_features = ['post_id', 'topic', 'text_len', 'tfidf_sum', 'tfidf_mean', 'tfidf_max', 'post_likes']

df_posts_features = df_new[posts_features].drop_duplicates()
df_posts_features.to_csv('../../data/data_posts_features.csv', index=False, sep=';')