# data mixing & preprocessing

In [1]:
import os

import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

### load & split data

In [2]:
# Build one corpus from three sources, concatenated in the same order for
# labels and texts so the two Series stay aligned row-for-row:
#   1. the BBC CSV files, 2. 20 Newsgroups, 3. the webhouse CSV files.
#
# 20 Newsgroups is fetched once (with headers/footers stripped from the text);
# `remove` only edits the document text, so `.target` is unaffected by it.
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers'))

# `Series.append` was removed in pandas 2.0, so the parts are combined with
# `pd.concat`. `ignore_index=True` gives every row a unique 0..n-1 label, which
# keeps the train/test CSVs written later re-joinable by index.
gen_target = pd.concat(
    [
        pd.read_csv('bbc_target.csv', header=None)[0],
        pd.Series(data=newsgroups.target),
        pd.read_csv('webhouse_target.csv', header=None)[0],
    ],
    ignore_index=True,
)
gen_text = pd.concat(
    [
        pd.read_csv('bbc_text.csv', header=None)[0],
        pd.Series(data=newsgroups.data),
        pd.read_csv('webhouse_text.csv', header=None)[0],
    ],
    ignore_index=True,
)

# Every text must have exactly one label.
assert len(gen_target) == len(gen_text), "targets and texts are misaligned"

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [3]:
# Display the shapes of the label and text Series side by side; they should match
# (one label per text).
gen_target.shape, gen_text.shape

((111682,), (111682,))

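Current topic labels (format: `'category' - label id,binary class`, where binary class 1 = tech-related and 0 = everything else):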

*  'alt.atheism' - 0,0
*  'comp.graphics' - 1,1
*  'comp.os.ms-windows.misc' - 2,1
*  'comp.sys.ibm.pc.hardware' - 3,1
*  'comp.sys.mac.hardware' - 4,1
*  'comp.windows.x' - 5,1
*  'misc.forsale' - 6,1
*  'rec.autos' - 7,0
*  'rec.motorcycles' - 8,0
*  'rec.sport.baseball' - 9,0
*  'rec.sport.hockey' - 10,0
*  'sci.crypt' - 11,1
*  'sci.electronics' - 12,1
*  'sci.med' - 13,1
*  'sci.space' - 14,1
*  'soc.religion.christian' - 15,0
*  'talk.politics.guns' - 16,0
*  'talk.politics.mideast' - 17,0
*  'talk.politics.misc' - 18,0
*  'talk.religion.misc' - 19,0


*  'business' - 20,1
*  'entertainment' - 21,0
*  'politics' - 22,0
*  'sport' - 23,0
*  'tech'  - 24,1
*  'finance' - 25,1
*  'travel' - 26,0

Get counts for categories:

In [4]:
# Numeric ids for the string category names that come from the CSV files
# (the keys include the literal single quotes). Both 'sport' and 'sports'
# map to the same id, 23.
label_ids = {
    "'business'": 20,
    "'entertainment'": 21,
    "'politics'": 22,
    "'sport'": 23,
    "'tech'": 24,
    "'finance'": 25,
    "'travel'": 26,
    "'sports'": 23,
}
gen_target = gen_target.replace(label_ids)

# Number of documents per label id.
unique, counts = np.unique(gen_target, return_counts=True)
dict(zip(unique, counts))

In [6]:
# Stratified split (default 75% train / 25% test) so every topic keeps its share
# in both parts. A fixed random_state makes the split -- and every score
# computed from it below -- reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    gen_text,
    gen_target,
    stratify=gen_target,
    random_state=42,
)

### transform data
transform x

In [None]:
# Earlier configuration, kept for reference:
# tfidf = TfidfVectorizer(sublinear_tf=True, norm='l2', stop_words='english', min_df=20, max_df=100000, ngram_range=(1, 2))
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=0.0003,
    norm=None,
    stop_words='english',
)

# Learn vocabulary and idf weights from the training texts only, then reuse
# that fitted vectorizer for the test texts.
features = tfidf.fit_transform(x_train)
vectors_test = tfidf.transform(x_test)

In [None]:
# Shape of the TF-IDF matrix produced by fitting the vectorizer on x_train.
features.shape

transform y

In [None]:
# Label ids that count as the positive ("tech") class; all other ids become 0.
tech = [
    1, 2, 3, 4, 5, 6,    # comp.* and misc.forsale
    11, 12, 13, 14,      # sci.*
    20, 24, 25,          # business, tech, finance
]


def _to_binary(labels):
    """Return a uint8 array with 1 where the label id is in `tech`, else 0."""
    is_tech = np.isin(labels.values, tech)
    return is_tech.astype(np.uint8)


bin_tr_y = _to_binary(y_train)
bin_tes_y = _to_binary(y_test)

# Class balance of the binary training target.
unique, counts = np.unique(bin_tr_y, return_counts=True)
dict(zip(unique, counts))

Train a simple baseline model

In [None]:
# Baseline: multinomial Naive Bayes on the TF-IDF features, scored by accuracy
# on the held-out test split.
clf = MultinomialNB(alpha=0.01).fit(features, bin_tr_y)
pred = clf.predict(vectors_test)
accuracy_score(y_true=bin_tes_y, y_pred=pred)

0.8839941262848752

In [None]:
# save transformed data
np.savetxt('train_test_data/vectors_test', vectors_test, del=',')
np.savetxt('train_test_data/features', features, del=',')
np.savetxt('train_test_data/bin_train_y', bin_train_y, del=',')
np.savetxt('train_test_data/bin_tes_y', bin_tes_y, del=',')

# save plain data
x_train.to_csv('train_test_data/x_train')
x_test.to_csv('train_test_data/x_test')
y_train.to_csv('train_test_data/y_train')
y_test.to_csv('train_test_data/y_test')

### Save data for js tf-idf vectorization

In [None]:
import json

In [None]:
def default(o):
    """Fallback serializer for ``json.dump``/``json.dumps`` (pass as ``default=``).

    Converts NumPy values, which the ``json`` module cannot encode itself,
    into the equivalent built-in Python objects.

    Parameters
    ----------
    o : object
        The value ``json`` could not serialize.

    Returns
    -------
    int, float, bool or list
        Built-in equivalent of a NumPy integer, float, boolean or array.

    Raises
    ------
    TypeError
        If ``o`` is not a supported NumPy type.
    """
    # Match on the abstract NumPy scalar types so every width is covered
    # (int32, uint8, float32, ...), not only the 64-bit ones.
    if isinstance(o, np.integer):
        return int(o)
    if isinstance(o, np.floating):
        return float(o)
    if isinstance(o, np.bool_):
        return bool(o)
    if isinstance(o, np.ndarray):
        return o.tolist()
    raise TypeError(f"Object of type {type(o).__name__} is not JSON serializable")

In [None]:
# Export the fitted vocabulary (term -> column index) as JSON; `default`
# converts any NumPy integer indices that json cannot encode on its own.
vocab_json = json.dumps(tfidf.vocabulary_, default=default)
with open('tfidf_vocab.json', 'w') as vocab_file:
    vocab_file.write(vocab_json)

In [None]:
# Export the learned idf weights as a plain JSON list of floats.
idf_weights = tfidf.idf_.tolist()
with open('idf_.json', 'w') as idf_file:
    idf_file.write(json.dumps(idf_weights))

In [1]:
# with open('transformed.json', 'w') as f:
#     json.dump(transformed.toarray().tolist(), f)