In [167]:
import numpy as np
import pandas as pd
from stempel import StempelStemmer
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import copy
from sklearn.feature_extraction.text import CountVectorizer

In [189]:
def load_file(filepath):
    with open(filepath, encoding="utf-8") as f:
        text = f.read()
    return text


def preprocess_text(filepath, stopwords, stemmer, posting_delimiter='********'):
    text = load_file(filepath)
    
    text_transformed = text.replace('\n', ' ').replace('\r', '')
    opisy_ofert = text_transformed.split(posting_delimiter)
    
    opisy_ofert_bez_stopwords = []
    for opis in opisy_ofert:
        opis_bez_stopwords = opis
        for stopword in stopwords:
            opis_bez_stopwords = re.sub(r"\b%s\b" %stopword, '', opis_bez_stopwords)
        opisy_ofert_bez_stopwords.append(opis_bez_stopwords)

    opisy_po_stemmingu = []
    for opis in opisy_ofert_bez_stopwords:
        opisy_po_stemmingu.append([stemmer.stem(word.lower()) for word in opis.split()])

    return opisy_po_stemmingu


def create_tfidf_frame(opisy_ofert, etykiety):

    vectorizer = TfidfVectorizer()
    cechy_tfidf = vectorizer.fit_transform(opisy_ofert)

    tfidf = cechy_tfidf.toarray()
    cechy_df = pd.DataFrame(tfidf, columns=vectorizer.get_feature_names_out())

    # Dodawanie etykiet
    cechy_df['label'] = etykiety

    # Wyświetlanie ramki danych
    return cechy_df


def create_tfidf_columns(df_tfidf):
    tfidf_cols = pd.DataFrame([df_tfidf.iloc[:,:-1].sum(axis=1), df_tfidf.iloc[:,:-1].mean(axis=1)]).T
    tfidf_cols.columns=['tfidf_sum', 'tfidf_mean']
    return tfidf_cols

def create_emotions_columns(opisy_ofert, grouped_emotions):
    # Tworzenie wektora cech
    vectorizer = CountVectorizer()
    cechy = vectorizer.fit_transform(opisy_ofert)

    # Konwersja wektora cech do ramki danych
    df_emotions = pd.DataFrame(cechy.toarray(), columns=vectorizer.get_feature_names_out())

    for column in  list(df_emotions.columns):
        if column in list(grouped_emotions.index):
            df_emotions[column] *= grouped_emotions.loc[column,'emotions']

    emotion_cols = pd.DataFrame([df_emotions.sum(axis=1), df_emotions.mean(axis=1)]).T
    emotion_cols.columns=['emotions_sum', 'emotions_mean']
    return emotion_cols

In [178]:
with open('data\stop_words_polish.txt', encoding="utf-8") as f:
    stopwords = f.readlines()

for i in range(len(stopwords)):
    stopwords[i] = stopwords[i].replace('\n', '')

In [179]:
stemmer = StempelStemmer.polimorf()

Loading: 100%|██████████| 11368252/11368252 [00:15<00:00, 735002.23bytes/s]


In [180]:
opisy_pozytywne = preprocess_text('data\FakeJobHunter - pozytywne.txt', stopwords, stemmer, "Opis ogłoszenia")
opisy_negatywne = preprocess_text('data\FakeJobHunter - negatywne.txt', stopwords, stemmer)

In [181]:
opisy_pozytywne_full = []
opisy_negatywne_full = []

for opis in opisy_pozytywne:
    try:
        opisy_pozytywne_full.append(' '.join(opis))
    except:
        print(opis)
        continue

for opis in opisy_negatywne:
    try:
        opisy_negatywne_full.append(' '.join(opis))
    except:
        print(opis)
        continue

['nativus', 'dzień', 'doradca:', 'mieć', 'bezpośredni', 'kontakt', 'nasze', 'klient', 'zapewniać', 'kompleksowy', 'obsługę,', 'być', 'profesjonalny', 'budować', 'przyjazny', 'relacja', 'klientami,', 'pracować', 'swój', 'premię,', 'prowadzić', 'aktywny', 'sprzedaż', 'produkt', 'usługi', 'bankowy', 'ubezpieczeniowych,', 'przeprowadzać', 'transakcja', 'gotówkowy', 'bezgotówkowe.', 'to', 'stanowisko', 'twoje,', ':', 'interesować', 'branża', 'finansowy', '–', 'chcieć', 'nenen', 'wiedza', 'rozwijać,', 'lubić', 'ludzi.', 'n', 'bać', 'kontakt', 'klient', 'być', 'otwarty/', 'potrzeby,', 'podchodzić', 'entuzjastycznić', 'praca', 'chcieć', 'mść', 'satysfakcja', 'osiągać', 'celów,', 'mieć', 'wykształcić', 'średnia', 'wyższe,', 'mieć', 'doświadczyć', 'praca', 'związać', 'finansami,', 'sprzedaż', 'obsługa', 'klient', '(mila', 'widziane).', 'dołączać', 'otrzymasz:', 'umowa', 'pracę,', 'stały', 'wynagrodzić', 'premię,', 'atrakcyjny', 'pakiet', 'benefit', 'karta', 'multisport,', 'prywatny', 'opieka', '

In [182]:
len(opisy_pozytywne_full), len(opisy_negatywne_full)

(13, 24)

In [183]:
etykiety = [1] * len(opisy_pozytywne_full) + [0] * len(opisy_negatywne_full)
opisy_ofert = opisy_pozytywne_full + opisy_negatywne_full

df_tfidf = create_tfidf_frame(opisy_ofert, etykiety)


In [186]:
tfidf_cols = create_tfidf_columns(df_tfidf)
tfidf_cols

Unnamed: 0,tfidf_sum,tfidf_mean
0,0.0,0.0
1,10.145443,0.006278
2,11.200993,0.006931
3,3.341539,0.002068
4,5.797564,0.003588
5,9.428065,0.005834
6,9.375801,0.005802
7,9.420874,0.00583
8,6.901856,0.004271
9,9.819691,0.006077


In [187]:
grouped_emotions = pd.read_csv('data\grouped_emotions_dictionary.csv')
grouped_emotions.set_index('word', inplace=True)

In [188]:
grouped_emotions

Unnamed: 0_level_0,emotions
word,Unnamed: 1_level_1
$,0
$L,0
$T,0
"1,3-butadien",0
1/2,0
...,...
′,0
″,0
€,0
₴,0


In [190]:
emotion_cols = create_emotions_columns(opisy_ofert, grouped_emotions)
emotion_cols

Unnamed: 0,emotions_sum,emotions_mean
0,0.0,0.0
1,100.0,0.061881
2,117.0,0.072401
3,10.0,0.006188
4,34.0,0.02104
5,82.0,0.050743
6,71.0,0.043936
7,77.0,0.047649
8,48.0,0.029703
9,86.0,0.053218


In [191]:
tfidf_cols.merge(emotion_cols, left_index=True, right_index=True)

Unnamed: 0,tfidf_sum,tfidf_mean,emotions_sum,emotions_mean
0,0.0,0.0,0.0,0.0
1,10.145443,0.006278,100.0,0.061881
2,11.200993,0.006931,117.0,0.072401
3,3.341539,0.002068,10.0,0.006188
4,5.797564,0.003588,34.0,0.02104
5,9.428065,0.005834,82.0,0.050743
6,9.375801,0.005802,71.0,0.043936
7,9.420874,0.00583,77.0,0.047649
8,6.901856,0.004271,48.0,0.029703
9,9.819691,0.006077,86.0,0.053218
