based on [this comment][1]

  [1]: https://www.kaggle.com/c/facebook-recruiting-iii-keyword-extraction/forums/t/6650/share-your-approach?forumMessageId=36434#post36434

In [1]:
import numpy as np
import pandas as pd
import IPython.display
from six.moves import cPickle as pickle
from tqdm import tqdm
tqdm.pandas()
from IPython.display import display


def maybe_pickle(file_name, load_dataset, force=False):
    pickle_file_name = "pickle/" + file_name + ".pickle"
    import os
    if not os.path.exists("pickle"):
        os.makedirs("pickle")
        
    if os.path.exists(pickle_file_name) and not force:
        # You may override by setting force=True.
        print('%s already present - Skipping pickling.' % pickle_file_name)
    else:
        print('Pickling %s.' % pickle_file_name)
        dataset = load_dataset(None)
        try:
            with open(pickle_file_name, 'wb') as f:
                pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
        except Exception as e:
            print('Unable to save data to', file_name, ':', e)
    
    return pickle_file_name

def load_data(file_name, force=False):
    original_file_path = "../input/" + file_name + ".csv"
    pickle_file_name = maybe_pickle(file_name, lambda x: pd.read_csv(original_file_path), force)
    
    with open(pickle_file_name, 'rb') as f:
        return pickle.load(f)

In [2]:
biology = load_data("biology")
cooking = load_data("cooking")
crypto = load_data("crypto")
diy = load_data("diy")
robotics = load_data("robotics")
travel = load_data("travel")

pickle/biology.pickle already present - Skipping pickling.
pickle/cooking.pickle already present - Skipping pickling.
pickle/crypto.pickle already present - Skipping pickling.
pickle/diy.pickle already present - Skipping pickling.
pickle/robotics.pickle already present - Skipping pickling.
pickle/travel.pickle already present - Skipping pickling.


In [3]:
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
stemmer = SnowballStemmer("english")
wordnet_lemmatizer = WordNetLemmatizer()
punctuation_trans_table = str.maketrans({key: None for key in string.punctuation})
html_tag_regex = re.compile('<.*?>')
code_tag_regex = re.compile('<code>([^<]+)</code>', re.S)
a_tag_regex = re.compile('<a href([^<]+)</a>', re.S)

def cleaning_text(text):
    original_text_length = len(text)
    number_of_html_tag = len(re.findall(html_tag_regex, text))
    number_of_code_fragments = len(re.findall(code_tag_regex, text))
    number_of_a_href = len(re.findall(a_tag_regex, text))
    
    # convert to lowercase
    text = text.lower()
    # remove code fragment
    text = re.sub(code_tag_regex, 'code_tag', text)
    # remove html tags
    text = re.sub(html_tag_regex, '', text)
    # remove \r, \n
    text = text.replace('\n', ' ').replace('\r', '')
    # remove Punctuations
    text = text.translate(punctuation_trans_table)
    # split
    words = word_tokenize(text)
    # remove stop words
    words = [word for word in words if word not in stopwords.words('english')]
    # lemmatizing, stemming
    words = [wordnet_lemmatizer.lemmatize(word) for word in words]
    words = [stemmer.stem(word) for word in words]
    # join
    text = ' '.join(words)
    
    number_of_cleaned_text_tokens = len(words)
    cleaned_text_length = len(text)
    return text, [original_text_length, number_of_html_tag, number_of_code_fragments, number_of_a_href, number_of_cleaned_text_tokens, cleaned_text_length]


def cleaning(row):
    row['title'], title_meta_list = cleaning_text(row['title'])
    row['title_original_text_length'] = title_meta_list[0]
    row['title_number_of_cleaned_text_tokens'] = title_meta_list[4]
    row['title_cleaned_text_length'] = title_meta_list[5]
    
    row['content'], content_meta_list = cleaning_text(row['content'])
    row['content_original_text_length'] = content_meta_list[0]
    row['content_number_of_html_tag'] = content_meta_list[1]
    row['content_number_of_code_fragments'] = content_meta_list[2]
    row['content_number_of_a_href'] = content_meta_list[3]
    row['content_number_of_cleaned_text_tokens'] = content_meta_list[4]
    row['content_cleaned_text_length'] = content_meta_list[5]
    return row

In [4]:
def load_cleaned_df(file_name, force=False):
    original_file_path = "../input/" + file_name + ".csv"
    df = pd.read_csv(original_file_path)
    print("total len : %d" % len(df))
    result_df = df.progress_apply(cleaning, axis=1)
    
    # feature scaling for meta columns
    result_df['title_original_text_length'] = min_max_scaler.fit_transform(result_df['title_original_text_length'])
    result_df['title_number_of_cleaned_text_tokens'] = min_max_scaler.fit_transform(result_df['title_number_of_cleaned_text_tokens'])
    result_df['title_cleaned_text_length'] = min_max_scaler.fit_transform(result_df['title_cleaned_text_length'])
    result_df['content_original_text_length'] = min_max_scaler.fit_transform(result_df['content_original_text_length'])
    result_df['content_number_of_html_tag'] = min_max_scaler.fit_transform(result_df['content_number_of_html_tag'])
    result_df['content_number_of_code_fragments'] = min_max_scaler.fit_transform(result_df['content_number_of_code_fragments'])
    result_df['content_number_of_a_href'] = min_max_scaler.fit_transform(result_df['content_number_of_a_href'])
    result_df['content_number_of_cleaned_text_tokens'] = min_max_scaler.fit_transform(result_df['content_number_of_cleaned_text_tokens'])
    result_df['content_cleaned_text_length'] = min_max_scaler.fit_transform(result_df['content_cleaned_text_length'])
    
    return result_df
    
def maybe_pickle_cleaned_df(file_name, force=False):
    pickle_file_name = maybe_pickle(file_name + "_cleaned", lambda x: load_cleaned_df(file_name), force)
    
    with open(pickle_file_name, 'rb') as f:
        return pickle.load(f)

In [5]:
biology_cleaned_df = maybe_pickle_cleaned_df('biology')
cooking_cleaned_df = maybe_pickle_cleaned_df('cooking')
crypto_cleaned_df = maybe_pickle_cleaned_df('crypto')
diy_cleaned_df = maybe_pickle_cleaned_df('diy')
robotics_cleaned_df = maybe_pickle_cleaned_df('robotics')
travel_cleaned_df = maybe_pickle_cleaned_df('travel')

pickle/biology_cleaned.pickle already present - Skipping pickling.
pickle/cooking_cleaned.pickle already present - Skipping pickling.
pickle/crypto_cleaned.pickle already present - Skipping pickling.
pickle/diy_cleaned.pickle already present - Skipping pickling.
pickle/robotics_cleaned.pickle already present - Skipping pickling.
pickle/travel_cleaned.pickle already present - Skipping pickling.


In [7]:
# extract most common tags
def extract_tags_count(cleaned_df):
    tags_list = cleaned_df['tags'].str.split(pat=' ').tolist()
    total_tags = pd.Series([item for sublist in tags_list for item in sublist])
    print("total tags count : %d" % len(total_tags))
    total_tags = pd.DataFrame(total_tags.value_counts(), columns=['count'])
    print("unique tags count : %d" % len(total_tags))
    display(total_tags.describe())
    display(total_tags.head())
    return total_tags


biology_total_tags = extract_tags_count(biology_cleaned_df)
cooking_total_tags = extract_tags_count(cooking_cleaned_df)
crypto_total_tags = extract_tags_count(crypto_cleaned_df)
diy_total_tags = extract_tags_count(diy_cleaned_df)
robotics_total_tags = extract_tags_count(robotics_cleaned_df)
travel_total_tags = extract_tags_count(travel_cleaned_df)

total tags count : 33129
unique tags count : 678


Unnamed: 0,count
count,678.0
mean,48.862832
std,126.580001
min,1.0
25%,5.0
50%,14.0
75%,38.0
max,1448.0


Unnamed: 0,count
human-biology,1448
genetics,1229
evolution,1159
biochemistry,984
molecular-biology,863


total tags count : 35542
unique tags count : 736


Unnamed: 0,count
count,736.0
mean,48.290761
std,106.684593
min,1.0
25%,7.0
50%,18.0
75%,43.0
max,1444.0


Unnamed: 0,count
baking,1444
food-safety,1211
substitutions,920
equipment,816
bread,687


total tags count : 25484
unique tags count : 392


Unnamed: 0,count
count,392.0
mean,65.010204
std,156.04103
min,1.0
25%,6.0
50%,18.5
75%,54.0
max,1783.0


Unnamed: 0,count
encryption,1783
hash,1141
rsa,1095
aes,923
public-key,842


total tags count : 59129
unique tags count : 734


Unnamed: 0,count
count,734.0
mean,80.557221
std,227.93661
min,1.0
25%,9.0
50%,25.0
75%,72.0
max,4490.0


Unnamed: 0,count
electrical,4490
plumbing,2223
wiring,1674
lighting,1003
hvac,922


total tags count : 6520
unique tags count : 231


Unnamed: 0,count
count,231.0
mean,28.225108
std,48.026908
min,1.0
25%,4.5
50%,11.0
75%,31.0
max,306.0


Unnamed: 0,count
quadcopter,306
mobile-robot,295
arduino,282
control,255
motor,239


total tags count : 65334
unique tags count : 1645


Unnamed: 0,count
count,1645.0
mean,39.716717
std,157.570955
min,1.0
25%,3.0
50%,7.0
75%,23.0
max,3829.0


Unnamed: 0,count
visas,3829
air-travel,2273
usa,2168
schengen,1561
uk,1492


In [8]:
# predict which category.
def create_category_added_df(df, category):
    temp_df = df.copy()
    temp_df['category'] = category
    return temp_df


full_df = pd.concat([create_category_added_df(biology_cleaned_df, 'biology'),
                    create_category_added_df(cooking_cleaned_df, 'cooking'),
                    create_category_added_df(crypto_cleaned_df, 'crypto'),
                    create_category_added_df(diy_cleaned_df, 'diy'),
                    create_category_added_df(robotics_cleaned_df, 'robotics'),
                    create_category_added_df(travel_cleaned_df, 'travel')]
                   )

print(len(full_df))
print(full_df.head())

87000
   id                                              title  \
0   1  what critic ribosom bind site relat start codo...   
1   2         how rnase contamin rna base experi prevent   
2   3               are lymphocyt size cluster two group   
3   4     how long antibioticdos lb maintain good select   
4   5                 is exon order alway preserv splice   

                                             content  \
0  in prokaryot translat critic effici translat l...   
1  doe anyon suggest prevent rnase contamin work ...   
2  tortora write principl anatomi physiolog lymph...   
3  various peopl lab prepar liter lb add kanamyci...   
4  are case splice machineri construct mrna exon ...   

                                                tags category  
0  ribosome binding-sites translation synthetic-b...  biology  
1                                   rna biochemistry  biology  
2                 immunology cell-biology hematology  biology  
3                                       

In [9]:
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

stop_words = text.ENGLISH_STOP_WORDS
full_df_vectorizer = TfidfVectorizer(stop_words=stop_words)
full_df_vectors = full_df_vectorizer.fit_transform((full_df['title'] + " " + full_df['content']).tolist())
print(len(full_df_vectorizer.get_feature_names()))

149242


In [10]:
# SGD classifier for predict category.
from sklearn.linear_model import SGDClassifier
category_classifier = SGDClassifier(loss="modified_huber")
X_train, X_test, y_train, y_test = train_test_split(full_df_vectors, full_df['category'], test_size=0.3, random_state=42)
category_classifier.fit(X_train, y_train)
category_classifier.score(X_test, y_test)

0.98038314176245211

In [11]:
def add_has_tag_columns(cleaned_df_input, total_tags_df, min_tag_apper_count, in_place=False):
    tag_split_df = cleaned_df_input
    if not in_place:
        tag_split_df = cleaned_df_input.copy()
        
    if not 'split_tag' in tag_split_df.columns:
        tag_split_df['split_tag'] = tag_split_df['tags'].str.split()
        
    tags = total_tags_df[total_tags_df['count'] >= min_tag_apper_count].index
    print("start")
    print("total tags : %d" % len(tags))
    for tag in tags:
        print("add tag : %s" % tag)
        tag_split_df["is_has_%s" % tag] = tag_split_df.apply(lambda row: tag in row['split_tag'], axis=1)
    print("finish")
    
    return tag_split_df

def maybe_pickle_has_tag_df(file_name, cleaned_df, total_tags_df, min_tag_apper_count, force=False):
    pickle_file_name = maybe_pickle(file_name + "_has_tag", lambda x: add_has_tag_columns(cleaned_df, total_tags_df, min_tag_apper_count), force)
    
    with open(pickle_file_name, 'rb') as f:
        return pickle.load(f)

In [12]:
biology_has_tag_df = maybe_pickle_has_tag_df('biology', biology_cleaned_df, biology_total_tags, 5)
cooking_has_tag_df = maybe_pickle_has_tag_df('cooking', cooking_cleaned_df, cooking_total_tags, 5)
crypto_has_tag_df = maybe_pickle_has_tag_df('crypto', crypto_cleaned_df, crypto_total_tags, 5)
diy_has_tag_df = maybe_pickle_has_tag_df('diy', diy_cleaned_df, diy_total_tags, 5)
robotics_has_tag_df = maybe_pickle_has_tag_df('robotics', robotics_cleaned_df, robotics_total_tags, 5)
travel_has_tag_df = maybe_pickle_has_tag_df('travel', travel_cleaned_df, travel_total_tags, 5)
print(len(biology_has_tag_df.columns))
print(len(cooking_has_tag_df.columns))
print(len(crypto_has_tag_df.columns))
print(len(diy_has_tag_df.columns))
print(len(robotics_has_tag_df.columns))
print(len(travel_has_tag_df.columns))

pickle/biology_has_tag.pickle already present - Skipping pickling.
pickle/cooking_has_tag.pickle already present - Skipping pickling.
pickle/crypto_has_tag.pickle already present - Skipping pickling.
pickle/diy_has_tag.pickle already present - Skipping pickling.
pickle/robotics_has_tag.pickle already present - Skipping pickling.
pickle/travel_has_tag.pickle already present - Skipping pickling.
541
627
319
634
178
1027


In [58]:
def generate_vectors(df):
    vectorizer = TfidfVectorizer(stop_words=text.ENGLISH_STOP_WORDS)
    vectors = vectorizer.fit_transform((df['title'] + " " + df['content']).tolist())
    return (vectorizer, vectors)


def maybe_pickle_vectors(file_name, df, force=False):
    pickle_file_name = maybe_pickle(file_name + "_vectors", lambda x: generate_vectors(df), force)
    
    with open(pickle_file_name, 'rb') as f:
        return pickle.load(f)

biology_vectorizer, biology_vectors = maybe_pickle_vectors("biology", biology_cleaned_df)
cooking_vectorizer, cooking_vectors = maybe_pickle_vectors("cooking", cooking_cleaned_df)
crypto_vectorizer, crypto_vectors = maybe_pickle_vectors("crypto", crypto_cleaned_df)
diy_vectorizer, diy_vectors = maybe_pickle_vectors("diy", diy_cleaned_df)
robotics_vectorizer, robotics_vectors = maybe_pickle_vectors("robotics", robotics_cleaned_df)
travel_vectorizer, travel_vectors = maybe_pickle_vectors("travel", travel_cleaned_df)
print(len(biology_vectorizer.get_feature_names()))
print(len(cooking_vectorizer.get_feature_names()))
print(len(crypto_vectorizer.get_feature_names()))
print(len(diy_vectorizer.get_feature_names()))
print(len(robotics_vectorizer.get_feature_names()))
print(len(travel_vectorizer.get_feature_names()))

pickle/biology_vectors.pickle already present - Skipping pickling.
pickle/cooking_vectors.pickle already present - Skipping pickling.
pickle/crypto_vectors.pickle already present - Skipping pickling.
pickle/diy_vectors.pickle already present - Skipping pickling.
pickle/robotics_vectors.pickle already present - Skipping pickling.
pickle/travel_vectors.pickle already present - Skipping pickling.
38497
23660
45476
36376
23038
33711


In [59]:
vectorizer_dict = {'biology':biology_vectorizer,
                  'cooking':cooking_vectorizer,
                  'crypto':crypto_vectorizer,
                  'diy':diy_vectorizer,
                  'robotics':robotics_vectorizer,
                  'travel':travel_vectorizer}

In [65]:
def create_tag_classifier(total_tags, vectors, full_df):
    classifier_map = {}
    
    total_count = 0
    total_accuracy = 0.0
    for tag in total_tags.index:
        tag_column = 'is_has_%s' % tag
        if tag_column in full_df.columns:
            X_train, X_test, y_train, y_test = train_test_split(vectors, 
                                                                full_df[tag_column].astype(float), 
                                                                test_size=0.3, 
                                                                random_state=42)
            clf = SGDClassifier(loss="modified_huber")
            clf.fit(X_train, y_train)
            total_count += 1
            total_accuracy += clf.score(X_test, y_test)
            classifier_map[tag] = clf
    print("used tag count : %d / %d" % (total_count, len(total_tags)))
    print("average accuracy : %f" % (total_accuracy / total_count))
    
    return classifier_map


def maybe_pickle_classifier(file_name, total_tags, vectors, full_df, force=False):
    pickle_file_name = maybe_pickle(file_name + "_classifier", lambda x: create_tag_classifier(total_tags, vectors, full_df), force)
    
    with open(pickle_file_name, 'rb') as f:
        return pickle.load(f)

# make SGD classifier tor predict tags. train with one-vs-rest approach
print("biology") 
biology_tags_classifier_dict = maybe_pickle_classifier('biology', biology_total_tags, biology_vectors, biology_has_tag_df, True)
print("cooking")
cooking_tags_classifier_dict = maybe_pickle_classifier('cooking', cooking_total_tags, cooking_vectors, cooking_has_tag_df, True)
print("crypto")
crypto_tags_classifier_dict = maybe_pickle_classifier('crypto', crypto_total_tags, crypto_vectors, crypto_has_tag_df, True)
print("diy")
diy_tags_classifier_dict = maybe_pickle_classifier('diy', diy_total_tags, diy_vectors, diy_has_tag_df, True)
print("robotics")
robotics_tags_classifier_dict = maybe_pickle_classifier('robotics', robotics_total_tags, robotics_vectors, robotics_has_tag_df, True)
print("travel")
travel_tags_classifier_dict = maybe_pickle_classifier('travel', travel_total_tags, travel_vectors, travel_has_tag_df, True)

biology
Pickling pickle/biology_classifier.pickle.
used tag count : 536 / 678
average accuracy : 0.995697
cooking
Pickling pickle/cooking_classifier.pickle.
used tag count : 622 / 736
average accuracy : 0.997094
crypto
Pickling pickle/crypto_classifier.pickle.
used tag count : 314 / 392
average accuracy : 0.993633
diy
Pickling pickle/diy_classifier.pickle.
used tag count : 629 / 734
average accuracy : 0.996921
robotics
Pickling pickle/robotics_classifier.pickle.
used tag count : 173 / 231
average accuracy : 0.987932
travel
Pickling pickle/travel_classifier.pickle.
used tag count : 1022 / 1645
average accuracy : 0.997430


In [69]:
classifier_dict = {'biology':biology_tags_classifier_dict,
                  'cooking':cooking_tags_classifier_dict,
                  'crypto':crypto_tags_classifier_dict,
                  'diy':diy_tags_classifier_dict,
                  'robotics':robotics_tags_classifier_dict,
                  'travel':travel_tags_classifier_dict}

In [21]:
# predict for test data
test = load_data("test")
test_cleaned_df = maybe_pickle_cleaned_df('test')

Pickling pickle/test.pickle.
Pickling pickle/test_cleaned.pickle.
total len : 81926


81927it [55:21, 24.66it/s]


In [22]:
test_df_vectors = full_df_vectorizer.transform((test_cleaned_df['title'] + " " + test_cleaned_df['content']).tolist())
test_category = category_classifier.predict(test_df_vectors)

In [23]:
print(test_category)

['diy' 'crypto' 'biology' ..., 'biology' 'biology' 'robotics']


In [24]:
test_cleaned_df['category'] = test_category

In [70]:
def predict_tags(test_cleand_df, category):
    target_df = test_cleaned_df[test_cleaned_df['category'] == category]
    print(target_df)
    vectorizer = vectorizer_dict[category]
    vectors = vectorizer.transform(target_df['title'] + ' ' + target_df['content'])
    print(vectors)
    classifier_dict_inner = classifier_dict[category]
    
    tag_confidence_df = pd.DataFrame()
    for classifier_name in classifier_dict_inner:
        classifier = classifier_dict_inner[classifier_name]
        confidence = classifier.predict(vectors)
        print(classifier_name, confidence)

predict_tags(test_cleaned_df[:100], 'robotics')

           id                                              title  \
11         26                   veloc object electromagnet field   
51        147  what use way imagin concept spin relat subatom...   
60        186                         find angular acceler torqu   
72        228               acceler ring aerotrim human gyroscop   
76        239                   what properti object allow float   
84        286               how angular veloc vector rotat arent   
94        326                      angular momentum averag torqu   
105       361                               symmetr twin paradox   
137       478        is possibl project magnet field locat space   
145       503  open problem special relat noninerti motion fl...   
147       507                         tire speed depend friction   
169       603          chose refer frame earth rest doesnt rotat   
187       679                                 learn physic onlin   
194       727      what condit fulfil path mass 