# Eksploracja i preprocessing danych zbioru "Bag of Words Data Set"
Julia Kaznowska, Piotr Wilczyński <br>
Politechnika Warszawska, Wydział Matematyki i Nauk Informacyjnych, Wstęp do uczenia maszynowego

## Import niezbędnych bibliotek

In [1]:
import pandas as pd
import numpy as np
import random
import seaborn as sns
from collections import Counter
from matplotlib import pyplot as plt
import pickle
from scipy.sparse import lil_matrix
from scipy.sparse import csr_matrix
from itertools import chain
import tqdm

# wyświetlanie wizualizacji
%matplotlib inline

In [2]:
%%javascript
// Replace the output area's _should_scroll check with a function that always
// returns false, regardless of the `lines` argument.
// NOTE(review): presumably this keeps long cell outputs from being placed in a
// scrollable box; relies on a global `IPython` object being available — confirm
// for the notebook front end in use.
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

## Import danych i preprocessing

In [3]:
def _read_vocab(path):
    """Return the vocabulary stored in ``path`` as a list of words.

    The file is read as UTF-8 text, one entry per line, with surrounding
    whitespace (including the trailing newline) stripped from each line.
    The list position of a word is its zero-based line number in the file.

    Parameters
    ----------
    path : str
        Path of the vocabulary file.

    Returns
    -------
    list of str
        One stripped string per line, in file order.
    """
    with open(path, 'r', encoding='utf-8') as file:
        return [line.strip() for line in file]


# One vocabulary list per corpus.
enron_vocab = _read_vocab('data/vocab.enron.txt')
kos_vocab = _read_vocab('data/vocab.kos.txt')
nips_vocab = _read_vocab('data/vocab.nips.txt')
nytimes_vocab = _read_vocab('data/vocab.nytimes.txt')
pubmed_vocab = _read_vocab('data/vocab.pubmed.txt')

In [52]:
def _read_docword(path, vocab):
    """Parse a ``docword`` file into one word-count dict per document.

    The first three lines of the file are read as integers (D, W, NNZ).
    Every following non-blank line is split on whitespace and its first three
    fields are read as ``doc_id word_id count``; both ids are 1-based in the
    file and are shifted to 0-based positions here.

    Parameters
    ----------
    path : str
        Path of the docword file (read as UTF-8 text).
    vocab : list of str
        Vocabulary in which ``vocab[word_id - 1]`` is the word for ``word_id``.

    Returns
    -------
    tuple
        ``(D, W, NNZ, dict_list)`` where ``dict_list`` is a NumPy array built
        from ``D`` dicts, each mapping word -> count for one document.
    """
    with open(path, 'r', encoding='utf-8') as file:
        n_docs = int(file.readline())
        n_words = int(file.readline())
        n_nonzero = int(file.readline())
        # Kept as a NumPy array of dicts so that later cells can select
        # documents with a list of positions (fancy indexing).
        dict_list = np.array([{} for _ in range(n_docs)])
        for line in file:
            row = line.strip().split()
            if not row:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            doc_id = int(row[0]) - 1
            word_id = int(row[1]) - 1
            count = int(row[2])
            dict_list[doc_id][vocab[word_id]] = count
    return n_docs, n_words, n_nonzero, dict_list


enron_D, enron_W, enron_NNZ, enron_dict_list = _read_docword(
    'data/docword.enron.txt', enron_vocab)

kos_D, kos_W, kos_NNZ, kos_dict_list = _read_docword(
    'data/docword.kos.txt', kos_vocab)

nips_D, nips_W, nips_NNZ, nips_dict_list = _read_docword(
    'data/docword.nips.txt', nips_vocab)

nytimes_D, nytimes_W, nytimes_NNZ, nytimes_dict_list = _read_docword(
    'data/docword.nytimes.txt', nytimes_vocab)

pubmed_D, pubmed_W, pubmed_NNZ, pubmed_dict_list = _read_docword(
    'data/docword.pubmed.txt', pubmed_vocab)

In [None]:
# # Export of the parsed per-corpus documents (disabled: every line is commented out).
# with open('./pickles/enron_dict_list.pickle', 'wb') as f:
#  pickle.dump(enron_dict_list, f)

# with open('./pickles/kos_dict_list.pickle', 'wb') as f:
#  pickle.dump(kos_dict_list, f)

# with open('./pickles/nips_dict_list.pickle', 'wb') as f:
#  pickle.dump(nips_dict_list, f)

# with open('./pickles/nytimes_dict_list.pickle', 'wb') as f:
#  pickle.dump(nytimes_dict_list, f)

# with open('./pickles/pubmed_dict_list.pickle', 'wb') as f:
#  pickle.dump(pubmed_dict_list, f)

In [109]:
# Draw 1500 documents from every corpus with a fixed seed, shuffle the combined
# sample, and record which rows of the shuffled sample come from which corpus.
random.seed(42)

_corpora = (
    ("enron", enron_D, enron_dict_list),
    ("kos", kos_D, kos_dict_list),
    ("nips", nips_D, nips_dict_list),
    ("nytimes", nytimes_D, nytimes_dict_list),
    ("pubmed", pubmed_D, pubmed_dict_list),
)

# Sampling happens in the order listed above, which fixes the random stream.
_picked_ids = [
    random.sample(range(_n_docs), 1500)
    for _name, _n_docs, _dict_list in _corpora
]

# Sampled documents of all corpora, corpus after corpus.
docs = np.concatenate([
    _dict_list[_ids]
    for (_name, _n_docs, _dict_list), _ids in zip(_corpora, _picked_ids)
])

# Origin of every sampled document: corpus name and 0-based position in it.
docs_info = pd.DataFrame({
    "data_set": np.repeat([_name for _name, _n_docs, _dict_list in _corpora], 1500),
    "doc_id": np.concatenate(_picked_ids),
})

# Apply one random permutation to both the documents and their metadata.
new_indexes = random.sample(range(len(docs)), len(docs))
docs = docs[new_indexes]
docs_info = docs_info.iloc[new_indexes].reset_index(drop=True)
docs_info["doc_id"] = docs_info["doc_id"] + 1  # set original docID

# Row labels of the shuffled sample that belong to each corpus.
enron_indexes, kos_indexes, nips_indexes, nytimes_indexes, pubmed_indexes = (
    docs_info[docs_info["data_set"] == _name].index
    for _name, _n_docs, _dict_list in _corpora
)

In [187]:
# Vocabulary of the sample: every distinct word that occurs in `docs`, sorted.
# np.unique removes the duplicates; sorted() turns its result into a list.
words_used = sorted(np.unique(list(chain.from_iterable(doc.keys() for doc in docs))))

# Word -> column position. A dict lookup replaces list.index(), which would
# rescan the whole vocabulary for every (document, word) pair.
word_to_col = {str(word): col for col, word in enumerate(words_used)}

# Dense bag-of-words table: one row per document, one column per word in
# words_used, holding the word's count in that document (0 when absent).
bow = [[0] * len(words_used) for _ in range(len(docs))]
for ix, doc in enumerate(docs):
    bow_row = bow[ix]
    for word, cnt in doc.items():
        bow_row[word_to_col[word]] = cnt

In [195]:
# Export of the created data: each entry pairs a target file with the object
# that is pickled into it.
_exports = (
    ('./pickles/indexes.pickle',
     (enron_indexes, kos_indexes, nips_indexes, nytimes_indexes, pubmed_indexes)),
    ('./pickles/docs.pickle', (docs, docs_info)),
    ('./pickles/words_used.pickle', words_used),
    ('./pickles/bow.pickle', bow),
)

for _target, _payload in _exports:
    with open(_target, 'wb') as f:
        pickle.dump(_payload, f)

## Analiza

In [196]:
# Data import: reload the objects stored under ./pickles.
# NOTE: pickle.load can execute arbitrary code while loading; only load pickle
# files that come from a trusted source.
with open('./pickles/indexes.pickle', 'rb') as f:
    enron_indexes, kos_indexes, nips_indexes, nytimes_indexes, pubmed_indexes = pickle.load(f)

with open('./pickles/docs.pickle', 'rb') as f:
    docs, docs_info = pickle.load(f)

with open('./pickles/words_used.pickle', 'rb') as f:
    # This file stores the word list itself (it is dumped as a single object),
    # so the loaded value is bound to one name instead of being unpacked.
    words_used = pickle.load(f)

with open('./pickles/bow.pickle', 'rb') as f:
    bow = pickle.load(f)