In [1]:
import pandas as pd
import numpy as np
import nltk
import pickle
from collections import defaultdict

In [2]:
def count_lines(filepath_list):
    """Count newline characters across several UTF-8 text files.

    Each file is read in fixed-size chunks so that very large files never
    have to be held in memory at once.

    Args:
        filepath_list: Iterable of paths to text files readable as UTF-8.

    Returns:
        int: Total number of "\\n" characters found in all files. A final
        line that is not terminated by a newline is therefore not counted.

    Raises:
        OSError: If a file cannot be opened or read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    chunk_size = 1000000  # characters requested per read (text mode)
    count = 0
    for filepath in filepath_list:
        # The context manager closes the handle even if a read fails midway,
        # which the previous explicit close() call did not guarantee.
        with open(filepath, "r", encoding="utf-8") as f:
            # read() returns "" at end of file, which stops the iterator.
            for buffer in iter(lambda: f.read(chunk_size), ""):
                count += buffer.count("\n")
    return count

In [3]:
class WordCounter(object):
    """Word-frequency counter that converts CSV text columns to index lists.

    ``fit`` streams one text column of several CSV files, tokenizes every
    row and stores ``(word, count)`` pairs sorted by descending count in
    ``words_list``. ``transform`` then maps every row of the same column to
    a list of vocabulary indices, where index 0 stands for any word that is
    not part of the vocabulary.
    """

    # Token that owns index 0; every out-of-vocabulary word maps to it.
    UNKNOWN_TOKEN = "unk"
    # Number of CSV rows loaded per chunk while streaming a file.
    CHUNK_SIZE = 100000

    def __init__(self):
        self.tok = nltk.tokenize.toktok.ToktokTokenizer()
        # (word, count) pairs, most frequent first; filled by fit().
        self.words_list = []
        # Name of the text column; set by fit(), or assigned by hand when
        # words_list is restored from disk instead of being recomputed.
        self.target_col = None

    def __check_filepath_list(self, filepath_list):
        """Raise ValueError unless filepath_list is a non-empty list."""
        if filepath_list is None or not isinstance(filepath_list, list) or len(filepath_list) == 0:
            raise ValueError("filepath list is empty or not a list")

    def _iter_documents(self, filepath_list):
        """Yield the lower-cased token list of every row of target_col.

        Exactly one list is yielded per CSV row, in file order, so callers
        can rely on the output staying aligned with the input rows.
        """
        for filepath in filepath_list:
            data_df_iter = pd.read_csv(filepath, iterator=True, usecols=[self.target_col],
                                       chunksize=self.CHUNK_SIZE, encoding="utf-8")
            for chunk in data_df_iter:
                for content in chunk[self.target_col]:
                    if pd.isna(content):
                        # Empty CSV fields arrive as NaN (a float), not as
                        # text; treat them as documents without any words
                        # instead of handing a float to the tokenizer.
                        yield []
                    else:
                        yield [w.lower() for w in self.tok.tokenize(content)]

    def fit(self, filepath_list, target_col):
        """Count word occurrences in ``target_col`` over all given files.

        Args:
            filepath_list: Non-empty list of CSV file paths.
            target_col: Name of the column that holds the text.

        Raises:
            ValueError: If ``filepath_list`` is not a non-empty list.
        """
        self.__check_filepath_list(filepath_list)
        self.target_col = target_col
        counter = defaultdict(int)
        for words in self._iter_documents(filepath_list):
            for word in words:
                counter[word] += 1
        # sorted() is stable, so words with equal counts keep the order in
        # which they were first seen.
        self.words_list = sorted(counter.items(), key=lambda item: -item[1])

    def most_common(self, vocab_size):
        """Return the first ``vocab_size`` entries of ``words_list``."""
        return self.words_list[:vocab_size]

    def transform(self, filepath_list, max_words = None):
        """Convert every row of ``target_col`` into a list of word indices.

        Args:
            filepath_list: Non-empty list of CSV file paths.
            max_words: Vocabulary size including the unknown slot at index
                0. ``None`` keeps every word collected by ``fit``.

        Returns:
            list[list[int]]: One index list per CSV row, in file order.
            Words outside the vocabulary are encoded as 0; rows whose text
            is missing produce an empty list.

        Raises:
            ValueError: If ``filepath_list`` is not a non-empty list.
            AttributeError: If ``target_col`` has not been set.
        """
        self.__check_filepath_list(filepath_list)
        if getattr(self, "target_col", None) is None:
            raise AttributeError("target_col is not set; call fit() or assign target_col first")
        if max_words is None:
            max_words = len(self.words_list) + 1
        dictionary = {self.UNKNOWN_TOKEN: 0}
        # Clamp at zero: a negative slice bound would select almost the whole
        # list instead of nothing when max_words is 0.
        for word, _ in self.most_common(max(max_words - 1, 0)):
            # setdefault keeps indices unique: if the corpus itself contains
            # the unknown token, it stays at index 0 and the following word
            # does not end up sharing an index with it.
            dictionary.setdefault(word, len(dictionary))
        documents_indices = []
        for words in self._iter_documents(filepath_list):
            documents_indices.append([dictionary.get(word, 0) for word in words])
        return documents_indices

In [None]:
# Build the word-frequency table from the "description" column of all four
# extracted description files (train/test, regular and "active").
wordCounter = WordCounter()
wordCounter.fit(
    filepath_list=[
        "E:/kaggle/avito/preprocessing/train_descriptions.data",
        "E:/kaggle/avito/preprocessing/train_active_descriptions.data",
        "E:/kaggle/avito/preprocessing/test_descriptions.data",
        "E:/kaggle/avito/preprocessing/test_active_descriptions.data",
    ],
    target_col="description",
)

In [53]:
# Persist the sorted (word, count) list so a later session can skip fit().
# The context manager flushes and closes the file; the previous inline
# open() call left the handle open until garbage collection.
with open("E:/kaggle/avito/preprocessing/words_counter_list", "wb") as words_list_file:
    pickle.dump(wordCounter.words_list, words_list_file)

In [36]:
# Encode each description file with a vocabulary capped at max_words=10000
# and pickle the resulting index lists. One loop replaces four copy-pasted
# transform/dump pairs; paths and arguments are unchanged.
for dataset_name in ["train", "train_active", "test", "test_active"]:
    documents_indices = wordCounter.transform(
        ["E:/kaggle/avito/preprocessing/" + dataset_name + "_descriptions.data"],
        max_words=10000,
    )
    # Close every output file deterministically instead of leaking the
    # handle returned by an inline open() call.
    with open("E:/kaggle/avito/preprocessing/indices_top10000_" + dataset_name + "_descriptions", "wb") as indices_file:
        pickle.dump(documents_indices, indices_file)

In [4]:
# Restore a previously fitted counter without re-reading the corpus.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only
# load files that were written by this notebook.
wordCounter = WordCounter()
with open("E:/kaggle/avito/preprocessing/words_counter_list", "rb") as words_list_file:
    wordCounter.words_list = pickle.load(words_list_file)
# fit() is what normally sets target_col; transform() reads it, so it has to
# be assigned by hand when fit() is skipped.
wordCounter.target_col = "description"

In [15]:
# Keep only the occurrence counts, i.e. the second element of every
# (word, count) pair. Despite its name, words_list holds numbers from here on.
words_list = [pair[1] for pair in wordCounter.words_list]

In [34]:
# Bucket the per-word occurrence counts into 50000 equal-width bins.
words_list_histogram, hist_edges = np.histogram(words_list, bins=50000)

In [35]:
# First 100 bins as (upper bin edge, words in bin, cumulative share of words).
# Each sequence is sliced before zipping, so only 100 tuples are ever built.
list(
    zip(
        hist_edges[1:101],
        words_list_histogram[:100],
        (np.cumsum(words_list_histogram) / words_list_histogram.sum())[:100],
    )
)

[(1669.65718, 10197050, 0.996228380108552),
 (3338.31436, 15726, 0.997764774213277),
 (5006.97154, 6206, 0.9983710861688871),
 (6675.62872, 3454, 0.9987085340410555),
 (8344.285899999999, 2224, 0.998925813736395),
 (10012.94308, 1592, 0.9990813484823394),
 (11681.60026, 1184, 0.9991970225647504),
 (13350.25744, 903, 0.9992852435921297),
 (15018.91462, 724, 0.9993559767303607),
 (16687.571799999998, 640, 0.9994185032613936),
 (18356.22898, 465, 0.9994639326940973),
 (20024.88616, 415, 0.9995044772415639),
 (21693.54334, 370, 0.9995406253923174),
 (23362.20052, 299, 0.9995698370060343),
 (25030.8577, 259, 0.9995951407115617),
 (26699.51488, 257, 0.9996202490216796),
 (28368.172059999997, 195, 0.9996393000741037),
 (30036.82924, 189, 0.9996577649402993),
 (31705.486419999997, 169, 0.9996742758524002),
 (33374.143599999996, 169, 0.9996907867645012),
 (35042.80078, 145, 0.9997049529316883),
 (36711.45796, 136, 0.9997182398195328),
 (38380.11514, 109, 0.9997288888693493),
 (40048.77232, 129,

In [12]:
# Load only the identifier and description columns of the training set.
avito_train_df = pd.read_csv(
    "E:/kaggle/avito/train.csv/train.csv",
    usecols=["item_id", "description"],
    encoding="utf-8",
)

In [15]:
# Write both columns in a fixed order, without the DataFrame index.
avito_train_df.loc[:, ["item_id", "description"]].to_csv(
    "E:/kaggle/avito/preprocessing/train_descriptions.data",
    encoding="utf-8",
    index=False,
)

In [2]:
# Load only the identifier and description columns of the "train_active" set.
avito_train_active_df = pd.read_csv(
    "E:/kaggle/avito/train_active.csv/train_active.csv",
    usecols=["item_id", "description"],
    encoding="utf-8",
)

In [5]:
# Write both columns in a fixed order, without the DataFrame index.
avito_train_active_df.loc[:, ["item_id", "description"]].to_csv(
    "E:/kaggle/avito/preprocessing/train_active_descriptions.data",
    encoding="utf-8",
    index=False,
)

In [3]:
# Load only the identifier and description columns of the test set.
avito_test_df = pd.read_csv(
    "E:/kaggle/avito/test.csv/test.csv",
    usecols=["item_id", "description"],
    encoding="utf-8",
)

In [5]:
# Write both columns in a fixed order, without the DataFrame index.
avito_test_df.loc[:, ["item_id", "description"]].to_csv(
    "E:/kaggle/avito/preprocessing/test_descriptions.data",
    encoding="utf-8",
    index=False,
)

In [6]:
# Load only the identifier and description columns of the "test_active" set.
avito_test_active_df = pd.read_csv(
    "E:/kaggle/avito/test_active.csv/test_active.csv",
    usecols=["item_id", "description"],
    encoding="utf-8",
)

In [8]:
# Write both columns in a fixed order, without the DataFrame index.
avito_test_active_df.loc[:, ["item_id", "description"]].to_csv(
    "E:/kaggle/avito/preprocessing/test_active_descriptions.data",
    encoding="utf-8",
    index=False,
)

In [9]:
# Total number of newline characters across the four description files.
total_lines = count_lines(
    filepath_list=[
        "E:/kaggle/avito/preprocessing/train_descriptions.data",
        "E:/kaggle/avito/preprocessing/train_active_descriptions.data",
        "E:/kaggle/avito/preprocessing/test_descriptions.data",
        "E:/kaggle/avito/preprocessing/test_active_descriptions.data",
    ]
)

In [10]:
# Display the value computed by count_lines as the cell output.
total_lines

99167597