# Начало работы

In [1]:
import pandas as pd
from pathlib import Path
import re
from tqdm import tqdm
from math import ceil
from itertools import chain

tqdm.pandas()

# Идеи для фичей


In [2]:
answers_dir = Path(r"E:\Datasets\cert-answers")
dataset_dir = Path(r"C:\Users\admin\Downloads\r4.2\r4.2")

main_answers_file = answers_dir / "insiders.csv"

assert(answers_dir.is_dir())
assert(dataset_dir.is_dir())
assert(main_answers_file.is_file())

output_dir = Path(r'C:\Users\admin\Google Drive\Datasets\CERT_output')
assert(output_dir.is_dir())

In [3]:
device_cols = next(pd.read_csv(dataset_dir / 'device.csv', chunksize=10)).columns
email_cols = next(pd.read_csv(dataset_dir / 'email.csv', chunksize=10)).columns
file_cols = next(pd.read_csv(dataset_dir / 'file.csv', chunksize=10)).columns
http_cols = next(pd.read_csv(dataset_dir / 'http.csv', chunksize=10)).columns
logon_cols = next(pd.read_csv(dataset_dir / 'logon.csv', chunksize=10)).columns

# Собираем единный словарь

In [4]:
def count_file_lines(file):
    with open(file) as f:
        for count, _ in enumerate(f):
            pass
    return count

def collect_vocabulary(csv_name, chunk_size=100000):
    
    line_count = count_file_lines(dataset_dir / f'{csv_name}.csv')
    
    df_iter = pd.read_csv(dataset_dir / f'{csv_name}.csv', usecols=['date', 'user', 'content'], chunksize=chunk_size)
    
    result_set = set()

    for df in tqdm(df_iter, total=(ceil(line_count / chunk_size))):
        df['content'] = df['content'].str.lower().str.split()
        result_set = result_set.union(*map(set, df['content']))
        
    return result_set

# email_set = collect_vocabulary('email', chunk_size=500000)
# file_set = collect_vocabulary('file', chunk_size=500000)
# http_set = collect_vocabulary('http', chunk_size=500000)

In [5]:
from gensim.models import TfidfModel, nmf
from gensim.corpora import Dictionary
from gensim.models.ldamulticore import LdaModel

def chunk_iterator(filename, chunk_size=10000):
    for chunk in tqdm(pd.read_csv(filename, chunksize=chunk_size)):
        for document in chunk['content'].str.lower().str.split().values:
            yield document
            
def bow_chunk_iterator(filenames, dictionary, chunk_size=10000):
    for filename in filenames:
        print(filename)
        for chunk in tqdm(pd.read_csv(filename, chunksize=chunk_size)):
            for document in chunk['content'].str.lower().str.split().values:
                yield dictionary.doc2bow(document)
                
def tfidf_chunk_iterator(filenames, dictionary, tfidf, chunk_size=10000):
    for filename in filenames:
        print(filename)
        for chunk in tqdm(pd.read_csv(filename, chunksize=chunk_size)):
            for document in chunk['content'].str.lower().str.split().values:
                yield tfidf[dictionary.doc2bow(document)]

In [9]:
df_dict = Dictionary(chunk_iterator(dataset_dir / 'email.csv'))
df_dict.add_documents(chunk_iterator(dataset_dir / 'file.csv'))
df_dict.add_documents(chunk_iterator(dataset_dir / 'http.csv'))

df_dict.save((output_dir / 'content_dictionary.pkl').as_posix())

45it [01:35,  2.12s/it]
2844it [1:40:11,  2.11s/it]


In [6]:
df_dict = Dictionary.load((output_dir / 'content_dictionary.pkl').as_posix())

In [16]:
tfidf = TfidfModel(
    bow_chunk_iterator([
        dataset_dir / 'email.csv',
        dataset_dir / 'file.csv',
        dataset_dir / 'http.csv'
    ], df_dict))

tfidf.save((output_dir / 'tfidf_model.pkl').as_posix())

263it [07:33,  1.72s/it]
45it [01:05,  1.46s/it]
2844it [2:04:00,  2.62s/it]


In [9]:
tfidf = TfidfModel.load((output_dir / 'tfidf_model.pkl').as_posix())

In [10]:
nmf_model = nmf.Nmf(
    tfidf_chunk_iterator([
        dataset_dir / 'email.csv',
        dataset_dir / 'file.csv',
        dataset_dir / 'http.csv'
    ], df_dict, tfidf,
    ),
    num_topics=100
)

nmf_model.save((output_dir / 'nmf_model.pkl').as_posix())

0it [00:00, ?it/s]

C:\Users\admin\Downloads\r4.2\r4.2\email.csv


263it [36:19,  8.29s/it]
0it [00:00, ?it/s]

C:\Users\admin\Downloads\r4.2\r4.2\file.csv


45it [06:20,  8.45s/it]
0it [00:00, ?it/s]

C:\Users\admin\Downloads\r4.2\r4.2\http.csv


2844it [6:22:18,  8.07s/it]


StopIteration: 

In [13]:
lda_model = LdaModel(
    tfidf_chunk_iterator([
        dataset_dir / 'email.csv',
        dataset_dir / 'file.csv',
        dataset_dir / 'http.csv'
    ], df_dict, tfidf,
    ),
    num_topics=100
)

lda_model.save((output_dir / 'lda_model.pkl').as_posix())

0it [00:00, ?it/s]

C:\Users\admin\Downloads\r4.2\r4.2\email.csv


263it [34:12,  7.80s/it]
0it [00:00, ?it/s]

C:\Users\admin\Downloads\r4.2\r4.2\file.csv


45it [05:34,  7.44s/it]
0it [00:00, ?it/s]

C:\Users\admin\Downloads\r4.2\r4.2\http.csv


2844it [7:28:07,  9.45s/it]


In [7]:
lda_model = LdaModel.load((output_dir / 'lda_model.pkl').as_posix())

In [8]:
def process_content(filename, dictionary, model, chunk_size = 10000, postfix='_lda_content'):
    out_file = output_dir / (filename.stem + postfix + '.csv')
    assert(not out_file.is_file())
    for chunk in tqdm(pd.read_csv(filename, usecols=['id', 'content'], chunksize=chunk_size)):
        chunk['content'] = chunk['content']\
            .str.lower()\
            .str.split()\
            .apply(lambda doc: model[dictionary.doc2bow(doc)])

        chunk.to_csv(out_file, mode='a')

In [9]:
process_content(dataset_dir / 'email.csv', df_dict, lda_model)

263it [4:04:18, 55.74s/it]


In [12]:
process_content(dataset_dir / 'file.csv', df_dict, lda_model)

45it [43:19, 57.78s/it]


In [13]:
# credit to https://stackoverflow.com/a/53135031

def parallelize(data, func, num_of_processes=8):
    data_split = np.array_split(data, num_of_processes)
    pool = Pool(num_of_processes)
    data = pd.concat(pool.map(func, data_split))
    pool.close()
    pool.join()
    return data

def run_on_subset(func, data_subset):
    return data_subset.apply(func, axis=1)

def parallelize_on_rows(data, func, num_of_processes=8):
    return parallelize(data, partial(run_on_subset, func), num_of_processes)

def process_content_parallelized(filename, dictionary, model, chunk_size = 10000, postfix='_lda_content'):
    out_file = output_dir / (filename.stem + postfix + '.csv')
    assert(not out_file.is_file())
    for chunk in tqdm(pd.read_csv(filename, usecols=['id', 'content'], chunksize=chunk_size)):
    
        chunk['content'] = parallelize_on_rows(
            chunk['content'].str.lower().str.split()
            , lambda doc: model[dictionary.doc2bow(doc)])

        chunk.to_csv(out_file, mode='a')

In [14]:
process_content(dataset_dir / 'http.csv', df_dict, lda_model)

2844it [38:47:50, 49.11s/it]
