In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import StratifiedKFold
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

from gensim.models.word2vec import Word2VecKeyedVectors
from gensim.models.poincare import PoincareModel
from gensim.models.word2vec import Word2Vec
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from collections import namedtuple
import itertools
import random
import sqlite3

## Model Training

In [2]:
sns.set_context("poster")
sns.set_style("ticks")

In [3]:
%%time
model_poincare = PoincareModel.load("D:/wiki_cat_poincare")
model_node2vec = Word2Vec.load("D:/wiki_category_node2vec")
wiki_elmo = Word2VecKeyedVectors.load("D:/wiki_cat_elmo.kv")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


Wall time: 1min 46s


In [4]:
exc = {'Computing_by_natural_language', 'Cyberinfrastructure', 'Human_computers', 'Isotoxal_tilings', 'Musical_set_theory',
       'Tilings_by_order', 'Truncated_tilings', 'Bitruncated_tilings', 'Chinese-language_computing', 'Diatonic_set_theory',
       'Han_character_input', 'Indic_computing', 'Urdu_computing','Russian-language_computing', 'Transportation_engineering',
       'Mongolian-language_computing', 'Korean-language_computing','Prime_limits','Pitch_space','Traffic_simulation'}

In [5]:
def load_categories(filepath):
    with open(filepath, encoding="utf-8") as fp:
        categories = []
        for line in fp:
            cat, level = line.strip().split("\t")
            level = int(level)
            categories.append((cat, level))
    return categories

def load_edges(filepath):
    with open(filepath, encoding="utf-8") as fp:
        edges = []
        for line in fp:
            cat1, cat2, level = line.strip().split("\t")
            level = int(level)
            edges.append((cat1, cat2, level))
    return edges

VOCAB_TUPLE=namedtuple("CATEGORY_VOCAB", "name2id id2name size")
def load_vocab(filename):
    with open(filename, encoding="utf-8") as fp:
        # replace space with underscore for wikipedia category names
        name2id = {line.strip().replace(" ", "_"): str(i) for i, line in enumerate(fp)}
        id2name = {v: k for k,v in name2id.items()}
    vocab=VOCAB_TUPLE(name2id=name2id, id2name=id2name, size=len(name2id))
    return vocab

def load_data(filename):
    with open(filename, encoding="utf-8") as fp:
        # replace space with underscore for wikipedia category names
        name2id = {line.strip().replace(" ", "_"): str(i) for i, line in enumerate(fp)}
        id2name = {v: k for k,v in name2id.items()}
    return id2name

In [6]:
invalid_categories = load_categories("C:/Users/kanya/Desktop/kisti/CS_invalid_categories.txt")
invalid_edges = load_edges("C:/Users/kanya/Desktop/kisti/CS_invalid_edges.txt")
print((f"{len(invalid_categories)} invalid categories, {len(invalid_edges)} invalid edges")) 

13330 invalid categories, 14832 invalid edges


In [7]:
wiki_category_vocab = load_vocab("C:/Users/kanya/Desktop/kisti/categories.txt")
print((f"Found items in Wiki category vocab: name2id={len(wiki_category_vocab.name2id)}, "
      f"id2name={len(wiki_category_vocab.id2name)}"))

Found items in Wiki category vocab: name2id=1623862, id2name=1623862


In [8]:
all_cate = list(wiki_category_vocab[0])

In [9]:
def get_filtered_categories(edges, max_level=10):
    categories = dict()
    for e1, e2, level in edges:
        if level < max_level:
            for e in [e1, e2]:
                if e not in categories:
                    categories[e] = level
                categories[e] = min(level, categories[e])
    return [(k,v) for k,v in categories.items()]

In [10]:
result = pd.read_csv('C:/Users/kanya/Desktop/kisti/plusmath.txt.gz', sep='\t')

  interactivity=interactivity, compiler=compiler, result=result)


In [11]:
ACM_CSO_MATH = result[result['topic'].notnull() & result['CSO_ACM_MATH'].notnull()].drop_duplicates(subset='topic')
ACM_CSO_MATH = ACM_CSO_MATH['topic'].tolist()

In [12]:
invalid_category_level_dict = dict(get_filtered_categories(invalid_edges, max_level=20))
filtered_invalid_categories = [(cat, invalid_category_level_dict.get(cat, -1)) for cat, lvl in invalid_categories]
print((f"filtered_invalid_categories={len(filtered_invalid_categories)}"))

filtered_invalid_categories=13330


In [13]:
random.seed(9001)
filtered_invalid_categories = random.sample(filtered_invalid_categories, len(ACM_CSO_MATH))

In [14]:
len(all_cate)

1623862

In [15]:
def get_categories(categories, vocab): 
    return [vocab.name2id[c] for c in categories]

def get_categories_no_root(categories): 
    return [c for c, l in categories if c != '<CS_CATEGORY_TREE_ROOT>']

def process_model_data(
    categories_list, 
    vocab, 
    keyed_vectors_list, 
    skip_vocab_indexes=None
):
    categories_list = [categories_list[0], get_categories_no_root(categories_list[1])]
    vectors_list = [[] for i in range(len(categories_list))]
    skip_vocab_indexes = skip_vocab_indexes if skip_vocab_indexes is not None else set()
    for i, kv in enumerate(keyed_vectors_list):
        keys_list = [categories for categories in categories_list]
        if i not in skip_vocab_indexes:
            keys_list = [
                np.array(get_categories(categories, vocab)) 
                for categories in keys_list
            ]
        for vectors, keys in zip(vectors_list, keys_list):
            vectors.append(kv[keys])
    vectors_list = [np.hstack(vectors) for vectors in vectors_list]
    return vectors_list

def generate_training_data(
    valid_categories, 
    invalid_categories, 
    vocab, 
    keyed_vectors_list, 
    skip_vocab_indexes=None
):
    valid_vectors, invalid_vectors = process_model_data(
        [valid_categories, invalid_categories],
        vocab, 
        keyed_vectors_list, 
        skip_vocab_indexes=skip_vocab_indexes
    )
    X = np.vstack([valid_vectors, invalid_vectors])
    y = np.array([1]*valid_vectors.shape[0] + [0]*invalid_vectors.shape[0])
    assert X.shape[0] == y.shape[0], f"X.shape={X.shape} != y.shape={y.shape}"
    return X, y
    
def shuffle_data(X, y):
    idx = np.random.permutation(X.shape[0])
    X = X[idx]
    y = y[idx]
    return X, y

def fit_model_factory(valid_categories, invalid_categories, vocab):
    def fit_model(model, keyed_vectors_list, skip_vocab_indexes=None):
        X, y = generate_training_data(
            valid_categories, 
            invalid_categories, 
            vocab, 
            keyed_vectors_list,
            skip_vocab_indexes=skip_vocab_indexes
        )
        X, y = shuffle_data(X, y)
        print(f"X={X.shape}, y={y.shape}")
        model.fit(X, y)
        return model
    return fit_model

def get_best_scores(model):
    idx = model.scores_[1].mean(axis=0).argmax()
    scores = model.scores_[1][idx]
    return scores

def get_best_scores_grid_cv(model):
    best_idx = model.best_index_
    scores = np.array([
        v[best_idx] 
        for k,v in model.cv_results_.items()
        if k.startswith("split") and k.endswith("_test_score")
    ])
    return scores

In [16]:
model_scores = {}
model_best_params = {}

In [17]:
KEYWORD_CONFIG=namedtuple("KEYWORD_CONFIG", "kv_type kv_list skip_vocab_idx")

### Balanced model

In [18]:
def process_model_data2(
    categories_list, 
    vocab, 
    keyed_vectors_list, 
    skip_vocab_indexes=None
):
    categories_list = [categories for categories in categories_list]
    vectors_list = [[] for i in range(len(categories_list))]
    skip_vocab_indexes = skip_vocab_indexes if skip_vocab_indexes is not None else set()
    for i, kv in enumerate(keyed_vectors_list):
        keys_list = [categories for categories in categories_list]
        if i not in skip_vocab_indexes:
            keys_list = [
                np.array(get_categories(categories, vocab)) 
                for categories in keys_list
            ]
        for vectors, keys in zip(vectors_list, keys_list):
            vectors.append(kv[keys])
    vectors_list = [np.hstack(vectors) for vectors in vectors_list]
    return vectors_list

In [19]:
def model_pred(categories, predictor):
    vectors = process_model_data2(
    [categories],
    wiki_category_vocab,
    best_config.kv_list, 
    skip_vocab_indexes=best_config.skip_vocab_idx
    )
    
    valid_pred = model.predict(vectors[0])
    y_pred = valid_pred
    return y_pred
    
    

def get_classification_report(pred):    
    print(confusion_matrix(pred[0], pred[1]))
    print(classification_report(pred[0], pred[1], target_names=["invalid", "valid"]))
    

def pd_result(categories, model):
    result = model_pred(categories, model)
    result = {"category": categories,
              "y_predict" : result}
    result = pd.DataFrame(result)
    return result

In [20]:
import json
with open("C:/Users/kanya/Desktop/kisti/kisti/best_params_balanced.json") as fp:
    model_best_params = json.load(fp)

In [21]:
KEYWORD_CONFIG=namedtuple("KEYWORD_CONFIG", "kv_type kv_list skip_vocab_idx")
best_config = KEYWORD_CONFIG("elmo_node2vec", [wiki_elmo, model_node2vec], skip_vocab_idx={0})

best_model_type = "mlp"
best_model_key = f"{best_config.kv_type}_{best_model_type}"

In [22]:
invalid_categories = load_categories("C:/Users/kanya/Desktop/kisti/CS_invalid_categories.txt")
invalid_edges = load_edges("C:/Users/kanya/Desktop/kisti/CS_invalid_edges.txt")
wiki_category_vocab = load_vocab("C:/Users/kanya/Desktop/kisti/categories.txt")
invalid_category_level_dict = dict(get_filtered_categories(invalid_edges, max_level=20))
filtered_invalid_categories = [(cat, invalid_category_level_dict.get(cat, -1)) for cat, lvl in invalid_categories]
random.seed(9001)
filtered_invalid_categories = random.sample(filtered_invalid_categories, len(ACM_CSO_MATH))
model = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(64), random_state=20)
fit_model = fit_model_factory(ACM_CSO_MATH, filtered_invalid_categories, wiki_category_vocab)
model = fit_model(model, best_config.kv_list, skip_vocab_indexes=best_config.skip_vocab_idx)



X=(3512, 2348), y=(3512,)


## load wiki category tree

In [23]:
all_cate = pd.read_csv('C:/Users/kanya/Desktop/kisti/all_category_edges.tsv.gz', sep = '\t')

In [24]:
print('wikipedia category tree')
print(len(set(all_cate['child_cat'].tolist())))

wikipedia category tree
1623843


## BFS

In [25]:
subcate = ['Computer_science',
           'Mathematics',
           'Information_science',
           'Computer_engineering',
           'Statistics']


result= pd.DataFrame({'category': list(subcate),
                      'level': 1})
result1 = result
for i in range(19):
    relation = all_cate[all_cate['parent_cat'].isin(subcate)]
    subcate = set(relation['child_cat']) - set(result1['category'])
    subcate_data = pd.DataFrame({'category': list(subcate),
                                 'level':i+2})
    result1 = pd.concat([result1, subcate_data])

In [26]:
print('ICS after BFS')
print(len(result1))

ICS after BFS
1422160


## Manual filtering

In [27]:
top3 = pd.read_csv('C:/Users/kanya/Desktop/kisti/manual_filter_categories.txt', sep = '\t', header = None, names = ['cate'])['cate'].tolist()

In [28]:
subcate = ['Computer_science',
           'Mathematics',
           'Information_science',
           'Computer_engineering',
           'Statistics']


result= pd.DataFrame({'category': list(subcate),
                      'level': 1})
relation = all_cate[all_cate['parent_cat'].isin(subcate)]
relation = relation[relation['child_cat'].isin(top3)]
subcate = set(relation['child_cat']) - set(result['category'])
subcate_data = pd.DataFrame({'category': list(subcate),
                             'level':2})
result = pd.concat([result, subcate_data])
relation = all_cate[all_cate['parent_cat'].isin(subcate)]
relation = relation[relation['child_cat'].isin(top3)]
subcate = set(relation['child_cat']) - set(result['category'])
subcate_data = pd.DataFrame({'category': list(subcate),
                             'level':3})
result = pd.concat([result, subcate_data])

result1 = result
for i in range(17):
    relation = all_cate[all_cate['parent_cat'].isin(subcate)]
    subcate = set(relation['child_cat']) - set(result1['category'])
    subcate_data = pd.DataFrame({'category': list(subcate),
                                 'level':i+4})
    result1 = pd.concat([result1, subcate_data])

In [29]:
print('ICS after manual filtering')
print(len(result1))

ICS after manual filtering
1420711


## CD and ML

In [30]:
aaa = 'aaaa'

bbb = 'aaaa'

ccc = 'aaaa'
    
invadd = set(['Language', 'Time'])
    
subcate = ['Computer_science',
           'Mathematics',
           'Information_science',
           'Computer_engineering',
           'Statistics']

result2= pd.DataFrame({'category': list(subcate),
                      'level': 1})
relation = all_cate[all_cate['parent_cat'].isin(subcate)]
relation['tf'] = relation['child_cat'].str.contains(aaa, case = False)
relation = relation[relation['tf'] == 0]
subcate = set(relation['child_cat'])
subcate_data = pd.DataFrame({'category': list(subcate),
                             'level':2})
result2 = pd.concat([result2, subcate_data])

relation = all_cate[all_cate['parent_cat'].isin(subcate)]
relation['tf'] = relation['child_cat'].str.contains(aaa, case = False)
relation = relation[relation['tf'] == 0]
subcate = set(relation['child_cat'])
subcate_data = pd.DataFrame({'category': list(subcate),
                             'level':3})
result2 = pd.concat([result2, subcate_data])




for i in range(17):
    subset = list(set(all_cate[all_cate['parent_cat'].isin(subcate)]['child_cat']))
    predict = pd_result(subset, model)
    
    predict['tf'] = predict['category'].str.contains(bbb, case = False)
    index = predict[predict['tf'].isin([1]) & predict['y_predict'].isin([0])]['y_predict'].index
    predict['y_predict'][index] = 1
    if i < 5:
        predict['tf'] = predict['category'].str.contains(aaa, case = False)
    else:
        predict['tf'] = predict['category'].str.contains(ccc, case = False)
    index = predict[predict['tf'].isin([1]) & predict['y_predict'].isin([1])]['y_predict'].index
    predict['y_predict'][index] = 0
    
    subcate = set(predict[predict['y_predict']==1]['category']) - set(result2['category'])
    subcate = subcate - invadd
    invalid = set(predict[predict['y_predict']==0]['category'])
    
    
    subcate_data = pd.DataFrame({'category': list(subcate),
                                 'level':i+4,
                                 'valid':'valid'})
    invalid_data = pd.DataFrame({'category':list(invalid),
                                 'level':i+4,
                                 'valid':'invalid'})
    result2 = pd.concat([result2, subcate_data, invalid_data])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


A value is trying to be set on 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a

In [31]:
result = result2
result.index = range(len(result))
remove = pd.read_csv('C:/Users/kanya/Desktop/kisti/netresult.txt', sep = '\t')
acc = pd.concat([result[result['valid'].isnull()], result[result['valid']=='valid']], 0)
remove = set(acc['category'].tolist()).intersection(set(remove['cate'].tolist()))
remove = remove - exc
acc.index = range(len(acc))
ind = []
for i in range(len(acc)):
    if acc['level'][i] >3:
        if acc['category'][i] in remove:
            a = 0
        else:
            ind.append(acc.index[i])
    else:
        ind.append(acc.index[i])
finall = acc[acc['category'].index.isin(ind)]

In [32]:
print('ICS after CD and ML')
print(len(finall))

ICS after CD and ML
11275


In [33]:
invalid_cate = [
    'microsoft',
    'windows',
    'camera',
    'linux',
    'Plot_(narrative)',
    'macos',
    'Human_communication',
    'studios',
    'Quantum_mechanics',
    'Medical_Subject_Headings',
    '_ios',
    'ios_'
    'analyst',
    'Biogeography',
    'Geographic',
    'device',
    'Video_game_gameplay',
    'google',
    'Samsung',
    'companies',
    'journal',
    'conference',
    'Philosophy_of_mathematics',
    'Yahoo',
    'libraries',
    'Vehicle_technology',
    'Health_standards',
    'Arabic',
    'novels',
    'literature',
    'television',
    'website',
    'award',
    'researcher',
    'station',
    'amazon',
    'android',
    'twitter',
    'albums',
    r"\d{4}",
    'century',
    'company',
    'article',
    'university',
    'Classification_systems',
    'games',
    'book',
    'countries',
    'country',
    'certifications',
    'school',
    'Mathematicians',
    'Atomic_physics',
    'Statisticians',
    'profession',
    'organizations',
    'Computer_science_education',
    'Computer_science_literature',
    'year',
    'engineers',
    'Nintendo',
    'logos',
    'covers',
    'screenshot',
    'Goods',
    'Utilitarianism',
    'by_platform',
    'Early_modern_printing_databases',
    'Signage',
    'Time_series_software',
    'documents',
    'texts',
    'education',
    'day',
    'month',
    'criticism',
    'works',
    'United_States',
    'Skype',
    'videos',
    'Econometrics_software',
    'Bibliography',
    'Grouping',
    'Manuscripts',
    'Corpora',
    'Emergence',
    'Virtual_pets',
    'Self-organization',
    'Systems_science',
    'Astronomical_databases',
    'Biorepositories',
    'Omics',
    'Phylogenetics',
    'Formal_sciences',
    'chemists',
    'nationality',
    'Demography',
    'fiction',
    'Quantity',
    'homeostasis',
    'Wikipedia_categories_named_after_software',
    'Semiotics',
    'Strategy',
    'occupations',
    'Heating',
    'artists',
    'educators',
    'writers',
    'Gases',
    'Astronomical',
    'Power_control',
    'Fire_suppression',
    'fellows',
    'manufacturing',
    'reference',
    'Hand_tools',
    'HP_',
    'podcast',
    'CNN',
    'die',
    'Coolants',
    'Electrodynamics',
    'thermodynamics',
    'designers',
    'Nutritional_advice_pyramids',
    'Waves',
    'publications',
    'music',
    'office',
    'free_',
    'freeware',
    'Macintosh',
    'brand',
    'scientists',
    'theorists',
    'Vortices',
    'Lists',
    'Astronomy',
    '_biology'
    'Symbols',
    'Social_statistics'
    'Musical_tuning',
    'Ecoregions',
    'films',
    'Statisticians',
    'Climate_change',
    'Ecological_connectivity',
    'Book_Number',
    'Population_genetics',
    'Cartography',
    'Archives',
    'Museums',
    'Librarian',
    'Labs'
]

invadd = set(['Language', 'Time'])

valid_cate = [
    'computational',
    'Theorem',
    'computer_science',
    'theory',
    'language',
    'algorithm',
    'computing',
    'programming_languages',
]


exc = {'Computing_by_natural_language', 'Cyberinfrastructure', 'Human_computers', 'Isotoxal_tilings', 'Musical_set_theory',
       'Tilings_by_order', 'Truncated_tilings', 'Bitruncated_tilings', 'Chinese-language_computing', 'Diatonic_set_theory',
       'Han_character_input', 'Indic_computing', 'Urdu_computing','Russian-language_computing', 'Transportation_engineering',
       'Mongolian-language_computing', 'Korean-language_computing','Prime_limits','Pitch_space','Traffic_simulation'}

aaa = invalid_cate[0]
for i in range(len(invalid_cate)):
    aaa = aaa+ '|' +invalid_cate[i]

bbb = valid_cate[0]
for i in range(len(valid_cate)):
    bbb = bbb+ '|' +valid_cate[i]
    
    
invalid_cate1 = [
    'microsoft',
    'windows',
    'camera',
    'linux',
    'Plot_(narrative)',
    'macos',
    'Human_communication',
    'studios',
    '_ios',
    'ios_',
    'designers',
    'analyst',
    'Biogeography',
    'HP_',
    'Geographic',
    'device',
    'Skype',
    'Video_game_gameplay',
    'google',
    'Samsung',
    'companies',
    'Macintosh',
    'office',
    'music',
    'journal',
    'conference',
    'Philosophy_of_mathematics',
    'Yahoo',
    'libraries',
    'Arabic',
    'CNN',
    'novels',
    'literature',
    'television',
    'website',
    'award',
    'researcher',
    'station',
    'amazon',
    'android',
    'twitter',
    'albums',
    r"\d{4}",
    'century',
    'company',
    'article',
    'university',
    'games',
    'book',
    'countries',
    'freeware',
    'country',
    'certifications',
    'school',
    'Mathematicians',
    'Statisticians',
    'profession',
    'organizations',
    'Computer_science_education',
    'Computer_science_literature',
    'year',
    'engineers',
    'logos',
    'covers',
    'podcasts',
    'screenshot',
    'day',
    'month',
    'die',
    'free_'
    'Goods',
    'Utilitarianism',
    'by_platform',
    'Early_modern_printing_databases',
    'Signage',
    'Time_series_software',
    'documents',
    'texts',
    'education',
    'works',
    'United_States',
    'Nintendo',
    'Econometrics_software',
    'Bibliography',
    'Grouping',
    'Manuscripts',
    'Corpora',
    'Emergence',
    'Virtual_pets',
    'Self-organization',
    'Astronomical_databases',
    'Biorepositories',
    'Omics',
    'Labs',
    'Phylogenetics',
    'chemists',
    'nationality',
    'fiction',
    'Quantity',
    'homeostasis',
    'Wikipedia_categories_named_after_software',
    'Semiotics',
    'Strategy',
    'occupations',
    'Heating',
    'artists',
    'educators',
    'writers',
    'Gases',
    'Power_control',
    'Fire_suppression',
    'fellows',
    'manufacturing',
    'reference',
    'Hand_tools',
    'Coolants',
    'thermodynamics',
    'Nutritional_advice_pyramids',
    'Waves',
    'publications',
    'brand',
    'scientists',
    'theorists',
    'criticism',
    'Vortices',
    'Lists',
    'Symbols',
    'Social_statistics'
    'Musical_tuning',
    'Ecoregions',
    'films',
    'Statisticians',
    'Climate_change',
    'Ecological_connectivity',
    'Book_Number',
    'Population_genetics',
    'Cartography',
    'Archives',
    'Museums',
    'Librarian'
]


ccc = invalid_cate1[0]
for i in range(len(invalid_cate1)):
    ccc = ccc+ '|' +invalid_cate1[i]

In [34]:
subcate = ['Computer_science',
           'Mathematics',
           'Information_science',
           'Computer_engineering',
           'Statistics']

result= pd.DataFrame({'category': list(subcate),
                      'level': 1})
relation = all_cate[all_cate['parent_cat'].isin(subcate)]
relation = relation[relation['child_cat'].isin(top3)]
subcate = set(relation['child_cat']) - set(result['category'])
subcate_data = pd.DataFrame({'category': list(subcate),
                             'level':2})
result = pd.concat([result, subcate_data])
relation = all_cate[all_cate['parent_cat'].isin(subcate)]
relation = relation[relation['child_cat'].isin(top3)]
subcate = set(relation['child_cat']) - set(result['category'])
subcate_data = pd.DataFrame({'category': list(subcate),
                             'level':3})
result = pd.concat([result, subcate_data])

In [35]:
for i in range(17):
    subset = list(set(all_cate[all_cate['parent_cat'].isin(subcate)]['child_cat']))
    predict = pd_result(subset, model)
    
    predict['tf'] = predict['category'].str.contains(bbb, case = False)
    index = predict[predict['tf'].isin([1]) & predict['y_predict'].isin([0])]['y_predict'].index
    predict['y_predict'][index] = 1
    if i < 5:
        predict['tf'] = predict['category'].str.contains(aaa, case = False)
    else:
        predict['tf'] = predict['category'].str.contains(ccc, case = False)
    index = predict[predict['tf'].isin([1]) & predict['y_predict'].isin([1])]['y_predict'].index
    predict['y_predict'][index] = 0
    
    subcate = set(predict[predict['y_predict']==1]['category']) - set(result['category'])
    subcate = subcate - invadd
    invalid = set(predict[predict['y_predict']==0]['category'])
    
    
    subcate_data = pd.DataFrame({'category': list(subcate),
                                 'level':i+4,
                                 'valid':'valid'})
    invalid_data = pd.DataFrame({'category':list(invalid),
                                 'level':i+4,
                                 'valid':'invalid'})
    result = pd.concat([result, subcate_data, invalid_data])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be s

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame


In [36]:
result.index = range(len(result))
remove = pd.read_csv('C:/Users/kanya/Desktop/kisti/netresult.txt', sep = '\t')
acc = pd.concat([result[result['valid'].isnull()], result[result['valid']=='valid']], 0)
remove = set(acc['category'].tolist()).intersection(set(remove['cate'].tolist()))
remove = remove - exc
acc.index = range(len(acc))
ind = []
for i in range(len(acc)):
    if acc['level'][i] >3:
        if acc['category'][i] in remove:
            a = 0
        else:
            ind.append(acc.index[i])
    else:
        ind.append(acc.index[i])
finall = acc[acc['category'].index.isin(ind)]

In [37]:
print('Final categories')
print(len(finall))

Final categories
7354


## Evaluation

In [32]:
eva = pd.read_csv('C:/Users/kanya/Desktop/NA/level4.csv')

In [227]:
evaa = pd.merge(eva, finall, how = 'left')
evaa = evaa.fillna(0).replace('valid', 1)
evaa = evaa[evaa['level']==4]
print(classification_report(evaa['result'], evaa['valid']))

              precision    recall  f1-score   support

           0       0.93      0.91      0.92       254
           1       0.85      0.88      0.87       146

    accuracy                           0.90       400
   macro avg       0.89      0.90      0.89       400
weighted avg       0.90      0.90      0.90       400



In [228]:
evaa = pd.merge(eva, finall, how = 'left')
evaa = evaa.fillna(0).replace('valid', 1)
evaa = evaa[evaa['level']==5]
print(classification_report(evaa['result'], evaa['valid']))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97       333
           1       0.87      0.79      0.83        67

    accuracy                           0.94       400
   macro avg       0.91      0.88      0.90       400
weighted avg       0.94      0.94      0.94       400



In [229]:
evaa1 = pd.merge(eva, result[result['valid']!='invalid'], how = 'left')
evaa1 = evaa1.fillna(0).replace('valid', 1)

evaa1 = evaa1[evaa1['level']==4]
print(classification_report(evaa1['result'], evaa1['valid']))

              precision    recall  f1-score   support

           0       0.94      0.91      0.92       254
           1       0.85      0.90      0.87       146

    accuracy                           0.91       400
   macro avg       0.89      0.90      0.90       400
weighted avg       0.91      0.91      0.91       400



In [230]:
evaa1 = pd.merge(eva, result[result['valid']!='invalid'], how = 'left')
evaa1 = evaa1.fillna(0).replace('valid', 1)

evaa1 = evaa1[evaa1['level']==5]
print(classification_report(evaa1['result'], evaa1['valid']))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97       333
           1       0.87      0.79      0.83        67

    accuracy                           0.94       400
   macro avg       0.91      0.88      0.90       400
weighted avg       0.94      0.94      0.94       400



In [231]:
evaa2 = pd.merge(eva, result2[result2['valid']!='invalid'], how = 'left')
evaa2 = evaa2.fillna(0).replace('valid', 1)
evaa2 = evaa2[evaa2['level']==4]
print(classification_report(evaa2['result'], evaa2['valid']))

              precision    recall  f1-score   support

           0       0.91      0.72      0.80       254
           1       0.64      0.88      0.74       146

    accuracy                           0.78       400
   macro avg       0.78      0.80      0.77       400
weighted avg       0.81      0.78      0.78       400



In [232]:
evaa2 = pd.merge(eva, result2[result2['valid']!='invalid'], how = 'left')
evaa2 = evaa2.fillna(0).replace('valid', 1)
evaa2 = evaa2[evaa2['level']==5]
print(classification_report(evaa2['result'], evaa2['valid']))

              precision    recall  f1-score   support

           0       0.98      0.91      0.94       333
           1       0.66      0.90      0.76        67

    accuracy                           0.91       400
   macro avg       0.82      0.90      0.85       400
weighted avg       0.92      0.91      0.91       400

