This notebook is based on the approach described in [this comment][1].

  [1]: https://www.kaggle.com/c/facebook-recruiting-iii-keyword-extraction/forums/t/6650/share-your-approach?forumMessageId=36434#post36434

In [1]:
import numpy as np
import pandas as pd
import IPython.display
from six.moves import cPickle as pickle
from tqdm import tqdm
tqdm.pandas()
from IPython.display import display


def maybe_pickle(file_name, load_dataset, force=False):
    """Cache the result of ``load_dataset`` in ``pickle/<file_name>.pickle``.

    Args:
        file_name: Base name of the cache file (no directory, no extension).
        load_dataset: Callable invoked as ``load_dataset(None)`` to build the
            object to cache. It is only called when the cache is (re)written.
        force: When True, rebuild the cache even if the file already exists.

    Returns:
        The path of the pickle file as a string. Saving is best-effort: if it
        fails, the error is printed and the path is still returned, so the
        caller's subsequent ``open`` is what surfaces the problem.
    """
    import os

    pickle_file_name = "pickle/" + file_name + ".pickle"
    os.makedirs("pickle", exist_ok=True)

    if os.path.exists(pickle_file_name) and not force:
        # You may override by setting force=True.
        print('%s already present - Skipping pickling.' % pickle_file_name)
        return pickle_file_name

    print('Pickling %s.' % pickle_file_name)
    dataset = load_dataset(None)

    # Write to a temporary name first and only move it into place once the
    # dump has fully succeeded. A failed dump must never leave a truncated
    # file at the final path: the next run would report it as "already
    # present" and then fail while loading it. This also keeps a previously
    # good cache intact when a forced rebuild fails.
    partial_file_name = pickle_file_name + ".tmp"
    try:
        with open(partial_file_name, 'wb') as f:
            pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
        os.replace(partial_file_name, pickle_file_name)
    except Exception as e:
        print('Unable to save data to', pickle_file_name, ':', e)
        if os.path.exists(partial_file_name):
            os.remove(partial_file_name)

    return pickle_file_name

def load_data(file_name, force=False):
    """Return the dataset read from ``../input/<file_name>.csv`` via the pickle cache.

    The CSV is only parsed when ``maybe_pickle`` decides the cache has to be
    (re)built; the returned object is always the one loaded back from the
    pickle file.
    """
    csv_path = "../input/" + file_name + ".csv"

    def read_source_csv(_unused):
        # maybe_pickle passes a single placeholder argument to its loader.
        return pd.read_csv(csv_path)

    cache_path = maybe_pickle(file_name, read_source_csv, force)

    with open(cache_path, 'rb') as cache_file:
        dataset = pickle.load(cache_file)
    return dataset

In [2]:
# Load the raw frame for each topic: load_data reads ../input/<topic>.csv
# and goes through the pickle cache under pickle/<topic>.pickle.
biology = load_data("biology")
cooking = load_data("cooking")
crypto = load_data("crypto")
diy = load_data("diy")
robotics = load_data("robotics")
travel = load_data("travel")

pickle/biology.pickle already present - Skipping pickling.
pickle/cooking.pickle already present - Skipping pickling.
pickle/crypto.pickle already present - Skipping pickling.
pickle/diy.pickle already present - Skipping pickling.
pickle/robotics.pickle already present - Skipping pickling.
pickle/travel.pickle already present - Skipping pickling.


In [3]:
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
# Module-level helpers shared by cleaning_text().

# Non-greedy match of anything between '<' and '>' (used to strip HTML tags).
html_tag_regex = re.compile('<.*?>')
# Translation table that deletes every character in string.punctuation:
# the third argument of str.maketrans lists characters to map to None.
punctuation_trans_table = str.maketrans('', '', string.punctuation)
wordnet_lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer("english")

_ENGLISH_STOP_WORDS_CACHE = None


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOP_WORDS_CACHE
    if _ENGLISH_STOP_WORDS_CACHE is None:
        _ENGLISH_STOP_WORDS_CACHE = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS_CACHE


def cleaning_text(text):
    """Normalise one piece of raw (HTML) text into space-joined word stems.

    Steps, in order: strip HTML tags, turn newlines into spaces and drop
    carriage returns, delete punctuation, tokenize, drop English stop words,
    lemmatize and then stem each remaining token, and join with single spaces.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    # TODO remove code fragment
    # TODO remove url
    # TODO convert to lowercase
    # TODO add meta features from original text
    ## length of the raw text in chars
    ## number of code segments
    ## number of 'a href' tags
    ## number of times 'http' occurs (count urls)
    ## number of times 'greater-than sign' occurs (count html tags)
    # TODO add meta features from cleaned text
    ## number of words(tokens) in the clean text
    ## length of the clean text in chars
    # TODO feature scaling(0-1 range) with min-max

    # The stop-word list is loaded once and kept as a set. Calling
    # stopwords.words('english') inside the filter below would rebuild the
    # list and scan it linearly for every single token.
    english_stop_words = _english_stop_words()

    # remove html tags
    text = html_tag_regex.sub('', text)
    # remove \r, \n
    text = text.replace('\n', ' ').replace('\r', '')
    # remove punctuation
    text = text.translate(punctuation_trans_table)
    # split, then remove stop words
    # NOTE(review): the text is not lower-cased before this filter (see the
    # TODO above), so capitalised stop words appear to survive -- the cleaned
    # titles shown later in the notebook still begin with "what", "how", "is".
    words = [word for word in word_tokenize(text) if word not in english_stop_words]
    # lemmatizing, then stemming
    words = [stemmer.stem(wordnet_lemmatizer.lemmatize(word)) for word in words]
    # join
    return ' '.join(words)


def cleaning(row):
    """Replace the row's 'title' and 'content' with their cleaned text and return the row."""
    for column in ('title', 'content'):
        row[column] = cleaning_text(row[column])
    return row

In [4]:
def load_cleaned_df(file_name, force=False):
    """Read ``../input/<file_name>.csv`` and return it with every row passed through ``cleaning``.

    Prints the number of rows before cleaning starts. ``force`` is accepted
    but not used by this function.
    """
    raw_df = pd.read_csv("../input/" + file_name + ".csv")
    print("total len : %d" % len(raw_df))
    # Row-wise apply with a progress bar.
    cleaned_df = raw_df.progress_apply(cleaning, axis=1)
    return cleaned_df
    
def maybe_pickle_cleaned_df(file_name, force=False):
    """Return the cleaned frame for ``file_name``, cached as ``<file_name>_cleaned`` by ``maybe_pickle``."""

    def build_cleaned_df(_unused):
        # maybe_pickle passes a single placeholder argument to its loader.
        return load_cleaned_df(file_name)

    cache_path = maybe_pickle(file_name + "_cleaned", build_cleaned_df, force)

    with open(cache_path, 'rb') as cache_file:
        cleaned_df = pickle.load(cache_file)
    return cleaned_df

In [5]:
# Cleaned frame per topic (title/content run through cleaning_text), cached
# under pickle/<topic>_cleaned.pickle so the cleaning only runs when the
# cache file is missing.
biology_cleaned_df = maybe_pickle_cleaned_df('biology')
cooking_cleaned_df = maybe_pickle_cleaned_df('cooking')
crypto_cleaned_df = maybe_pickle_cleaned_df('crypto')
diy_cleaned_df = maybe_pickle_cleaned_df('diy')
robotics_cleaned_df = maybe_pickle_cleaned_df('robotics')
travel_cleaned_df = maybe_pickle_cleaned_df('travel')

pickle/biology_cleaned.pickle already present - Skipping pickling.
pickle/cooking_cleaned.pickle already present - Skipping pickling.
pickle/crypto_cleaned.pickle already present - Skipping pickling.
pickle/diy_cleaned.pickle already present - Skipping pickling.
pickle/robotics_cleaned.pickle already present - Skipping pickling.
pickle/travel_cleaned.pickle already present - Skipping pickling.


In [6]:
from sklearn.feature_extraction import text
stop_words = text.ENGLISH_STOP_WORDS

from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF over the cleaned biology question bodies.
# stop_words is passed as a list: ENGLISH_STOP_WORDS is a frozenset, and
# scikit-learn releases that validate constructor parameters only accept
# 'english', a list, or None here. The set of words removed is unchanged.
biology_vectorizer = TfidfVectorizer(stop_words=sorted(stop_words))
biology_content_vectors = biology_vectorizer.fit_transform(biology_cleaned_df['content'].tolist())
print(biology_content_vectors)

  (0, 28049)	0.201563513586
  (0, 34574)	0.545545939178
  (0, 10235)	0.216224552536
  (0, 12532)	0.173535276741
  (0, 21244)	0.161736628034
  (0, 29886)	0.201563513586
  (0, 6780)	0.156742573526
  (0, 31584)	0.148569968154
  (0, 29433)	0.124191996023
  (0, 32441)	0.267732578259
  (0, 9339)	0.195943328919
  (0, 18675)	0.195261252058
  (0, 33104)	0.155603571764
  (0, 3445)	0.32400093874
  (0, 6090)	0.348485775607
  (0, 6369)	0.133603543064
  (0, 25023)	0.157488359995
  (0, 12523)	0.120612338712
  (1, 11850)	0.127145493515
  (1, 5420)	0.13836070344
  (1, 32960)	0.149749344724
  (1, 27866)	0.17474823558
  (1, 30009)	0.536496325093
  (1, 9805)	0.221790913884
  (1, 36842)	0.123449524514
  :	:
  (13195, 11745)	0.098511212202
  (13195, 34643)	0.244383302654
  (13195, 37204)	0.142427245395
  (13195, 16087)	0.0971914389405
  (13195, 16164)	0.08262312955
  (13195, 16157)	0.0747259525996
  (13195, 10714)	0.0974760327265
  (13195, 26845)	0.115670570417
  (13195, 34368)	0.0979120197791
  (13195, 238

In [7]:
# extract most common tags
def extract_tags_count(cleaned_df):
    """Count how often each tag occurs in ``cleaned_df['tags']``.

    Each entry of the 'tags' column is split on single spaces. The function
    prints the total and unique tag counts, displays ``describe()`` and
    ``head()`` of the result, and returns a DataFrame indexed by tag with a
    single 'count' column (most frequent first).
    """
    flat_tags = []
    for row_tags in cleaned_df['tags'].str.split(pat=' ').tolist():
        for tag in row_tags:
            flat_tags.append(tag)

    tag_series = pd.Series(flat_tags)
    print("total tags count : %d" % len(tag_series))

    tag_counts = pd.DataFrame(tag_series.value_counts(), columns=['count'])
    print("unique tags count : %d" % len(tag_counts))

    display(tag_counts.describe())
    display(tag_counts.head())
    return tag_counts


# Tag frequency table per topic. Each call also prints the total/unique tag
# counts and displays describe() and head() of the resulting counts.
biology_total_tags = extract_tags_count(biology_cleaned_df)
cooking_total_tags = extract_tags_count(cooking_cleaned_df)
crypto_total_tags = extract_tags_count(crypto_cleaned_df)
diy_total_tags = extract_tags_count(diy_cleaned_df)
robotics_total_tags = extract_tags_count(robotics_cleaned_df)
travel_total_tags = extract_tags_count(travel_cleaned_df)

total tags count : 33129
unique tags count : 678


Unnamed: 0,count
count,678.0
mean,48.862832
std,126.580001
min,1.0
25%,5.0
50%,14.0
75%,38.0
max,1448.0


Unnamed: 0,count
human-biology,1448
genetics,1229
evolution,1159
biochemistry,984
molecular-biology,863


total tags count : 35542
unique tags count : 736


Unnamed: 0,count
count,736.0
mean,48.290761
std,106.684593
min,1.0
25%,7.0
50%,18.0
75%,43.0
max,1444.0


Unnamed: 0,count
baking,1444
food-safety,1211
substitutions,920
equipment,816
bread,687


total tags count : 25484
unique tags count : 392


Unnamed: 0,count
count,392.0
mean,65.010204
std,156.04103
min,1.0
25%,6.0
50%,18.5
75%,54.0
max,1783.0


Unnamed: 0,count
encryption,1783
hash,1141
rsa,1095
aes,923
public-key,842


total tags count : 59129
unique tags count : 734


Unnamed: 0,count
count,734.0
mean,80.557221
std,227.93661
min,1.0
25%,9.0
50%,25.0
75%,72.0
max,4490.0


Unnamed: 0,count
electrical,4490
plumbing,2223
wiring,1674
lighting,1003
hvac,922


total tags count : 6520
unique tags count : 231


Unnamed: 0,count
count,231.0
mean,28.225108
std,48.026908
min,1.0
25%,4.5
50%,11.0
75%,31.0
max,306.0


Unnamed: 0,count
quadcopter,306
mobile-robot,295
arduino,282
control,255
motor,239


total tags count : 65334
unique tags count : 1645


Unnamed: 0,count
count,1645.0
mean,39.716717
std,157.570955
min,1.0
25%,3.0
50%,7.0
75%,23.0
max,3829.0


Unnamed: 0,count
visas,3829
air-travel,2273
usa,2168
schengen,1561
uk,1492


In [8]:
# predict which category.
def create_category_added_df(df, category):
    """Return a copy of ``df`` with a 'category' column set to ``category`` on every row.

    The input frame is left untouched.
    """
    labelled_df = df.copy()
    labelled_df['category'] = category
    return labelled_df


# Stack the six cleaned topic frames into one frame, labelling every row
# with its topic name in a new 'category' column (the prediction target).
# NOTE(review): pd.concat is called without ignore_index, so each frame
# presumably keeps its own row labels and labels repeat across topics --
# confirm nothing downstream looks rows up by index label.
full_df = pd.concat([create_category_added_df(biology_cleaned_df, 'biology'),
                    create_category_added_df(cooking_cleaned_df, 'cooking'),
                    create_category_added_df(crypto_cleaned_df, 'crypto'),
                    create_category_added_df(diy_cleaned_df, 'diy'),
                    create_category_added_df(robotics_cleaned_df, 'robotics'),
                    create_category_added_df(travel_cleaned_df, 'travel')]
                   )

# Quick sanity check: total row count and the first few rows.
print(len(full_df))
print(full_df.head())

87000
   id                                              title  \
0   1  what critic ribosom bind site relat start codo...   
1   2         how rnase contamin rna base experi prevent   
2   3               are lymphocyt size cluster two group   
3   4     how long antibioticdos lb maintain good select   
4   5                 is exon order alway preserv splice   

                                             content  \
0  in prokaryot translat critic effici translat l...   
1  doe anyon suggest prevent rnase contamin work ...   
2  tortora write principl anatomi physiolog lymph...   
3  various peopl lab prepar liter lb add kanamyci...   
4  are case splice machineri construct mrna exon ...   

                                                tags category  
0  ribosome binding-sites translation synthetic-b...  biology  
1                                   rna biochemistry  biology  
2                 immunology cell-biology hematology  biology  
3                                       

In [30]:
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

stop_words = text.ENGLISH_STOP_WORDS
# TF-IDF over "title content" for every question of every topic.
# stop_words is passed as a list: ENGLISH_STOP_WORDS is a frozenset, and
# scikit-learn releases that validate constructor parameters only accept
# 'english', a list, or None here. The set of words removed is unchanged.
full_df_vectorizer = TfidfVectorizer(stop_words=sorted(stop_words))
full_df_vectors = full_df_vectorizer.fit_transform((full_df['title'] + " " + full_df['content']).tolist())
# Number of features = size of the fitted vocabulary. vocabulary_ is used
# instead of get_feature_names(), which no longer exists in current
# scikit-learn releases; the printed number is the same.
print(len(full_df_vectorizer.vocabulary_))
# 70/30 split with a fixed seed; the target is the topic label.
X_train, X_test, y_train, y_test = train_test_split(full_df_vectors, full_df['category'], test_size=0.3, random_state=42)

In [29]:
# SGD classifier for predicting the category.
from sklearn.linear_model import SGDClassifier
# random_state is fixed (same seed as the train/test split) because SGD
# shuffles the training data; without it the score below changes from run
# to run and the notebook output cannot be reproduced.
clf = SGDClassifier(loss="modified_huber", random_state=42)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

149242


0.98061302681992335

In [68]:
def add_has_tag_columns(cleaned_df_input, total_tags_df, min_tag_apper_count, in_place=False):
    """Add one boolean ``has_tag_<tag>`` column per sufficiently frequent tag.

    Args:
        cleaned_df_input: Frame with a space-separated 'tags' column (or an
            already present 'split_tag' column holding a list of tags per row).
        total_tags_df: Frame indexed by tag with a 'count' column.
        min_tag_apper_count: Only tags whose count is at least this value get
            an indicator column.
        in_place: When True the columns are added to ``cleaned_df_input``
            itself; otherwise a copy is modified and the input is untouched.

    Returns:
        The frame that received the 'split_tag' column and the
        ``has_tag_<tag>`` columns. Column names carry the ``has_tag_`` prefix
        so a tag can never overwrite an existing column such as 'title'.

    Note:
        Earlier revisions computed each indicator and discarded it, so frames
        cached from those runs contain no indicator columns and have to be
        rebuilt.
    """
    tag_split_df = cleaned_df_input if in_place else cleaned_df_input.copy()

    if 'split_tag' not in tag_split_df.columns:
        tag_split_df['split_tag'] = tag_split_df['tags'].str.split()

    # One set per row, built once, so each membership test below is O(1).
    # Rows without a usable tag list (e.g. NaN tags) count as having no tags.
    row_tag_sets = [
        set(row_tags) if isinstance(row_tags, (list, tuple, set)) else set()
        for row_tags in tag_split_df['split_tag']
    ]

    tags = total_tags_df[total_tags_df['count'] >= min_tag_apper_count].index
    print("start")
    print("total tags : %d" % len(tags))
    for tag in tags:
        print("add tag : %s" % tag)
        # Store the indicator; the list is in row order, so it lines up with
        # the frame positionally whatever the index looks like.
        tag_split_df['has_tag_%s' % tag] = [tag in row_tags for row_tags in row_tag_sets]
    print("finish")

    return tag_split_df

#print(add_has_tag_columns(biology_tag_split_df, biology_total_tags, 5))

def maybe_pickle_has_tag_df(file_name, cleaned_df, total_tags_df, min_tag_apper_count, force=False):
    """Return the result of ``add_has_tag_columns`` for these inputs, cached as ``<file_name>_has_tag`` by ``maybe_pickle``."""

    def build_has_tag_df(_unused):
        # maybe_pickle passes a single placeholder argument to its loader.
        return add_has_tag_columns(cleaned_df, total_tags_df, min_tag_apper_count)

    cache_path = maybe_pickle(file_name + "_has_tag", build_has_tag_df, force)

    with open(cache_path, 'rb') as cache_file:
        has_tag_df = pickle.load(cache_file)
    return has_tag_df

In [69]:
# Run add_has_tag_columns for biology tags with count >= 5, cached under pickle/biology_has_tag.pickle.
biology_has_tag_df = maybe_pickle_has_tag_df('biology', biology_cleaned_df, biology_total_tags, 5)

pickle/biology_has_tag.pickle already present - Skipping pickling.


In [70]:
# Run add_has_tag_columns for cooking tags with count >= 5, cached under pickle/cooking_has_tag.pickle.
cooking_has_tag_df = maybe_pickle_has_tag_df('cooking', cooking_cleaned_df, cooking_total_tags, 5)

pickle/cooking_has_tag.pickle already present - Skipping pickling.


In [71]:
# Run add_has_tag_columns for crypto tags with count >= 5, cached under pickle/crypto_has_tag.pickle.
crypto_has_tag_df = maybe_pickle_has_tag_df('crypto', crypto_cleaned_df, crypto_total_tags, 5)

Pickling pickle/crypto_has_tag.pickle.
start
total tags : 314
add tag : encryption
add tag : hash
add tag : rsa
add tag : aes
add tag : public-key
add tag : cryptanalysis
add tag : elliptic-curves
add tag : signature
add tag : block-cipher
add tag : algorithm-design
add tag : protocol-design
add tag : random-number-generator
add tag : diffie-hellman
add tag : keys
add tag : authentication
add tag : symmetric
add tag : homomorphic-encryption
add tag : provable-security
add tag : hmac
add tag : reference-request
add tag : implementation
add tag : mac
add tag : tls
add tag : passwords
add tag : collision-resistance
add tag : key-exchange
add tag : key-derivation
add tag : stream-cipher
add tag : authenticated-encryption
add tag : cbc
add tag : modes-of-operation
add tag : dsa
add tag : one-time-pad
add tag : discrete-logarithm
add tag : modular-arithmetic
add tag : des
add tag : randomness
add tag : initialization-vector
add tag : prime-numbers
add tag : zero-knowledge-proofs
add tag : sh

In [72]:
# Run add_has_tag_columns for diy tags with count >= 5, cached under pickle/diy_has_tag.pickle.
diy_has_tag_df = maybe_pickle_has_tag_df('diy', diy_cleaned_df, diy_total_tags, 5)

Pickling pickle/diy_has_tag.pickle.
start
total tags : 629
add tag : electrical
add tag : plumbing
add tag : wiring
add tag : lighting
add tag : hvac
add tag : bathroom
add tag : repair
add tag : water
add tag : wood
add tag : drywall
add tag : concrete
add tag : insulation
add tag : walls
add tag : doors
add tag : basement
add tag : flooring
add tag : kitchens
add tag : switch
add tag : shower
add tag : heating
add tag : windows
add tag : tile
add tag : air-conditioning
add tag : painting
add tag : leak
add tag : ceiling
add tag : furnace
add tag : toilet
add tag : water-heater
add tag : floor
add tag : light-fixture
add tag : roof
add tag : paint
add tag : thermostat
add tag : tools
add tag : framing
add tag : circuit-breaker
add tag : woodworking
add tag : cleaning
add tag : receptacle
add tag : deck
add tag : gfci
add tag : garage
add tag : faucet
add tag : code-compliance
add tag : ceiling-fan
add tag : drain
add tag : pipe
add tag : attic
add tag : safety
add tag : foundation
add

In [73]:
# Run add_has_tag_columns for robotics tags with count >= 5, cached under pickle/robotics_has_tag.pickle.
robotics_has_tag_df = maybe_pickle_has_tag_df('robotics', robotics_cleaned_df, robotics_total_tags, 5)

Pickling pickle/robotics_has_tag.pickle.
start
total tags : 173
add tag : quadcopter
add tag : mobile-robot
add tag : arduino
add tag : control
add tag : motor
add tag : sensors
add tag : robotic-arm
add tag : pid
add tag : localization
add tag : microcontroller
add tag : slam
add tag : ros
add tag : raspberry-pi
add tag : irobot-create
add tag : wheeled-robot
add tag : kinematics
add tag : design
add tag : kalman-filter
add tag : computer-vision
add tag : imu
add tag : motion-planning
add tag : inverse-kinematics
add tag : mechanism
add tag : brushless-motor
add tag : battery
add tag : power
add tag : cameras
add tag : stepper-motor
add tag : accelerometer
add tag : electronics
add tag : navigation
add tag : software
add tag : algorithm
add tag : kinect
add tag : servos
add tag : gyroscope
add tag : actuator
add tag : matlab
add tag : dynamics
add tag : ekf
add tag : sensor-fusion
add tag : servomotor
add tag : torque
add tag : mapping
add tag : esc
add tag : rcservo
add tag : industr

In [74]:
# Run add_has_tag_columns for travel tags with count >= 5, cached under pickle/travel_has_tag.pickle.
travel_has_tag_df = maybe_pickle_has_tag_df('travel', travel_cleaned_df, travel_total_tags, 5)

Pickling pickle/travel_has_tag.pickle.
start
total tags : 1022
add tag : visas
add tag : air-travel
add tag : usa
add tag : schengen
add tag : uk
add tag : customs-and-immigration
add tag : transit
add tag : trains
add tag : passports
add tag : public-transport
add tag : luggage
add tag : tickets
add tag : legal
add tag : budget
add tag : canada
add tag : indian-citizens
add tag : europe
add tag : india
add tag : online-resources
add tag : germany
add tag : france
add tag : airports
add tag : japan
add tag : international-travel
add tag : safety
add tag : airlines
add tag : money
add tag : health
add tag : airport-transfer
add tag : food-and-drink
add tag : planning
add tag : bookings
add tag : driving
add tag : visa-refusal
add tag : china
add tag : us-citizens
add tag : paperwork
add tag : transportation
add tag : london
add tag : cellphones
add tag : layovers
add tag : australia
add tag : italy
add tag : hotels
add tag : buses
add tag : car-rentals
add tag : tips-and-tricks
add tag 

In [None]:
# TODO SGD classifier to predict tags. Train with a one-vs-rest approach.
# NOTE(review): for now this only repeats the category train/test split
# from the earlier cell (target is still full_df['category'], not tags).
X_train, X_test, y_train, y_test = train_test_split(full_df_vectors, full_df['category'], test_size=0.3, random_state=42)