In [1]:
import os
import re
import dotenv
import numpy as np
from numpy import inf
import pandas as pd
from pprint import pprint

In [133]:
# Load environment variables
project_dir = os.path.join(os.path.abspath(''), os.pardir)
dotenv_path = os.path.join(project_dir, '.env')

dotenv.load_dotenv(dotenv_path)

True

In [2]:
df = pd.read_csv('/home/jovyan/git_projects/machine-learning-project/data/processed/daily_measures_features_full.csv', encoding='utf-8')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4168 entries, 0 to 4167
Data columns (total 22 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Message Text               4168 non-null   object 
 1   Date                       4168 non-null   object 
 2   Impressions                4168 non-null   float64
 3   Comments                   381 non-null    float64
 4   Likes                      2364 non-null   float64
 5   Shares                     886 non-null    float64
 6   Other Engagements          2361 non-null   float64
 7   Engagements                2748 non-null   float64
 8   days_since_posted          4168 non-null   float64
 9   fanpage_id                 4168 non-null   float64
 10  ID                         4168 non-null   object 
 11  has_any_image              4168 non-null   bool   
 12  hashtags                   3329 non-null   object 
 13  has_any_hashtag            4168 non-null   bool 

In [4]:
# aggregate everything to sum (leistungsdaten) in order to get 1 row per ID
df_agg = df.groupby(['ID']).agg({
    'Impressions': 'sum',
    'Likes': 'sum',
    'Shares': 'sum',
    'Comments': 'sum',
    'Other Engagements': 'sum',
    'days_since_posted': 'max', # basically as feature more -> better, post acknoledge or good for LI algorithms
    'Engagements': 'sum',
    'Date': 'first',
    'fanpage_id': 'first',
    'has_any_image': 'first',
    'hashtags': 'first',
    'has_any_hashtag': 'first',
    'hour_posted': 'first',
    'dow_posted': 'first',
    'post_len': 'first',
    'timestamp_posted': 'first',
    'Dominant_Topic': 'first',
    'text_clean_and_translated': 'first',
    'Topic_Perc_Contrib': 'first'
}).reset_index()

In [5]:
df_agg = df_agg.rename(columns={
    'days_since_posted': 'days_since_last_collected_data',
    'Impressions': 'sum_imps',
    'Engagements': 'sum_engs',
    'Likes': 'sum_likes',
    'Shares': 'sum_shares',
    'Comments': 'sum_comments',
    'Other Engagements': 'sum_engs_other',
    'Dominant_Topic': 'topic_lda',
    'Topic_Perc_Contrib': 'sure_belongs_to_topic',
    'text_clean_and_translated': 'text_posted',
    'post_len': 'length_text_posted'
})

In [17]:
df_agg.iloc[350:355, :]

Unnamed: 0_level_0,sum_imps,sum_likes,sum_shares,sum_comments,sum_engs_other,days_since_last_collected_data,sum_engs,Date,fanpage_id,has_any_image,...,hour_posted,dow_posted,length_text_posted,timestamp_posted,topic_lda,text_posted,sure_belongs_to_topic,week_posted,minute_posted,day_posted
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
urn:li:share:7026195767409192960,10626.0,196.0,8.0,2.0,192.0,0.0,398.0,2023-01-31 00:00:00,118.0,False,...,15,Tuesday,561,2023-01-31 15:33:31.730,3,with the foundation of the joint venture cofin...,0.8301,5,33,31
urn:li:share:7026205768064413696,10.0,0.0,0.0,0.0,0.0,20.0,0.0,2023-02-20 00:00:00,60447.0,False,...,16,Tuesday,529,2023-01-31 16:13:15.834,0,are you interested in entering a leading techn...,0.84,5,13,31
urn:li:share:7026248677476175873,12.0,0.0,0.0,0.0,0.0,3.0,0.0,2023-02-03 00:00:00,44.0,False,...,19,Tuesday,653,2023-01-31 19:03:46.329,0,another episode of the chemistry and innovatio...,0.4004,5,3,31
urn:li:share:7026265377026256897,18.0,0.0,0.0,0.0,0.0,1.0,0.0,2023-02-01 00:00:00,50.0,False,...,0,Tuesday,306,2023-01-31 00:00:00.000,4,ready to install and ready to move the smart f...,0.4153,5,0,31
urn:li:share:7026293064277110784,501.0,26.0,7.0,0.0,11.0,1.0,44.0,2023-02-01 00:00:00,50.0,False,...,22,Tuesday,168,2023-01-31 22:00:08.898,0,were proud to announce the smart flex effector...,0.6083,5,0,31


In [47]:
df_agg['timestamp_posted'] = pd.to_datetime(df_agg['timestamp_posted'], format='ISO8601')
df_agg['week_posted'] = df_agg['timestamp_posted'].dt.isocalendar().week
df_agg['weekday_posted'] = df_agg['timestamp_posted'].dt.weekday
df_agg['minute_posted'] = df_agg['timestamp_posted'].dt.minute
df_agg['day_posted'] = df_agg['timestamp_posted'].dt.day

In [14]:
df_agg['topic_lda'] = df_agg['topic_lda'].astype(str)

In [15]:
df_agg = df_agg.set_index('ID')

In [16]:
df_agg.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4168 entries, urn:li:share:7015109880122384384 to urn:li:ugcPost:7091797022185295873
Data columns (total 22 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   sum_imps                        4168 non-null   float64       
 1   sum_likes                       4168 non-null   float64       
 2   sum_shares                      4168 non-null   float64       
 3   sum_comments                    4168 non-null   float64       
 4   sum_engs_other                  4168 non-null   float64       
 5   days_since_last_collected_data  4168 non-null   float64       
 6   sum_engs                        4168 non-null   float64       
 7   Date                            4168 non-null   object        
 8   fanpage_id                      4168 non-null   float64       
 9   has_any_image                   4168 non-null   bool          
 10  hashtags        

In [19]:
from keybert import KeyBERT

kw_model = KeyBERT()
extract_keywords = lambda doc: kw_model.extract_keywords(doc)

2023-10-13 09:59:32.080797: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-10-13 09:59:32.901493: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/conda/lib/:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-10-13 09:59:32.901623: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/conda/lib/:/usr/local/nvidia/lib:/usr/local/nvidia/lib64


Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [20]:
df_agg['bert_keywords'] = df_agg.text_posted.apply(extract_keywords)

In [21]:
df_agg['bert_keywords'] = df_agg.bert_keywords.apply(lambda x: ','.join([j[0] for j in x]))

In [22]:
df_agg['eng_rate'] = df_agg.sum_engs / df_agg.sum_imps

In [49]:
# add threshold to exclude new posts and ones with odd data
df_agg_threshold = df_agg[(df_agg.sum_imps >= 50) & (df_agg.days_since_last_collected_data > 0)]

In [50]:
# since sometimes it is impossible to translate hashtags, we will replace the feature with hashtags count
df_agg_threshold.loc[:, 'hashtags_count'] = df_agg_threshold.hashtags.str.split(',').apply(lambda x: len(x) if x else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_agg_threshold.loc[:, 'hashtags_count'] = df_agg_threshold.hashtags.str.split(',').apply(lambda x: len(x) if x else 0)


In [51]:
df_agg_threshold[df_agg_threshold['eng_rate'] >= 1.0]

Unnamed: 0_level_0,sum_imps,sum_likes,sum_shares,sum_comments,sum_engs_other,days_since_last_collected_data,sum_engs,Date,fanpage_id,has_any_image,...,topic_lda,text_posted,sure_belongs_to_topic,week_posted,minute_posted,day_posted,bert_keywords,eng_rate,weekday_posted,hashtags_count
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [52]:
X = df_agg_threshold.loc[:, [
    'has_any_image',
    'hashtags_count',
    'has_any_hashtag',
    'week_posted',
    'weekday_posted',
    'day_posted',
    'hour_posted',
    'minute_posted',
    'fanpage_id',
    #'timestamp_posted',
    'length_text_posted',
    'topic_lda',
    'sure_belongs_to_topic',
    'bert_keywords'
]]
y = df_agg_threshold[['eng_rate']]

In [53]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1618 entries, urn:li:share:7015210047886524416 to urn:li:ugcPost:7090698100771549185
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   has_any_image          1618 non-null   bool   
 1   hashtags_count         1618 non-null   int64  
 2   has_any_hashtag        1618 non-null   bool   
 3   week_posted            1618 non-null   UInt32 
 4   weekday_posted         1618 non-null   int32  
 5   day_posted             1618 non-null   int32  
 6   hour_posted            1618 non-null   int64  
 7   minute_posted          1618 non-null   int32  
 8   fanpage_id             1618 non-null   float64
 9   length_text_posted     1618 non-null   int64  
 10  topic_lda              1618 non-null   object 
 11  sure_belongs_to_topic  1618 non-null   float64
 12  bert_keywords          1618 non-null   object 
dtypes: UInt32(1), bool(2), float64(2), int32(3), int64(3

In [54]:
# One hot encoding for all categorical variables (decision tree needed)
X_dt = X.copy()

In [55]:
import unicodedata as ud
X_dt = pd.concat([
   X_dt,
   X_dt.topic_lda.str.get_dummies().add_prefix(f'lda_topic_'),
    X_dt.topic_lda.str.get_dummies().add_prefix(f'lda_topic_'),
    'week_posted', 'day_posted', 'hour_posted', 'minute_posted', 'weekday_posted'
   pd.get_dummies(X_dt.bert_keywords.str.split(',').explode()).groupby(level=0).sum()
], axis=1).drop(columns=['topic_lda', 'bert_keywords'])

In [57]:
X_dt.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1618 entries, urn:li:share:7015210047886524416 to urn:li:ugcPost:7090698100771549185
Columns: 3309 entries, has_any_image to 新春快乐
dtypes: UInt32(1), float64(2), int32(3), int64(3303)
memory usage: 40.8+ MB


In [58]:
X_dt

Unnamed: 0_level_0,has_any_image,hashtags_count,has_any_hashtag,week_posted,weekday_posted,day_posted,hour_posted,minute_posted,fanpage_id,length_text_posted,...,youth,youthen,yuan,zaragoza,zealand,zeitung,zhaoting,znozsvi6qc,zodiac,新春快乐
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
urn:li:share:7015210047886524416,1,0,0,52,6,1,8,0,118.0,330,...,0,0,0,0,0,0,0,0,0,0
urn:li:share:7015240414274158592,1,0,0,52,6,1,10,0,2165.0,189,...,0,0,0,0,0,0,0,0,0,0
urn:li:share:7015593540680351744,1,0,0,1,0,2,9,24,3041.0,493,...,0,0,0,0,0,0,0,0,0,0
urn:li:share:7015632542909276162,1,0,0,1,0,2,11,59,2165.0,600,...,0,0,0,0,0,0,0,0,0,0
urn:li:share:7015768702277066752,1,2,1,1,0,2,21,0,50.0,242,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
urn:li:ugcPost:7090219491409731584,0,1,1,30,3,27,8,40,118.0,826,...,0,0,0,0,0,0,0,0,0,0
urn:li:ugcPost:7090288923028926464,0,1,1,30,3,27,13,16,118.0,787,...,0,0,0,0,0,0,0,0,0,0
urn:li:ugcPost:7090310591004725249,0,2,1,30,3,27,14,42,7958.0,626,...,0,0,0,0,0,0,0,0,0,0
urn:li:ugcPost:7090350415921205248,0,0,0,30,3,27,17,21,113.0,531,...,0,0,0,0,0,0,0,0,0,0


In [59]:
y.describe()

Unnamed: 0,eng_rate
count,1618.0
mean,0.068354
std,0.084429
min,0.0
25%,0.021999
50%,0.0417
75%,0.078125
max,0.792905


In [105]:
X.to_csv('/home/jovyan/git_projects/machine-learning-project/data/processed/X_catboost.csv', encoding='utf-8', index=True)
X_dt.to_csv('/home/jovyan/git_projects/machine-learning-project/data/processed/X_decision_trees.csv', encoding='utf-8', index=True)
y.to_csv('/home/jovyan/git_projects/machine-learning-project/data/processed/y_both.csv', encoding='utf-8', index=True)