based on [this comment][1]

  [1]: https://www.kaggle.com/c/facebook-recruiting-iii-keyword-extraction/forums/t/6650/share-your-approach?forumMessageId=36434#post36434

In [1]:
import numpy as np
import pandas as pd
import IPython.display
from six.moves import cPickle as pickle
from tqdm import tqdm
tqdm.pandas()

def maybe_pickle(file_name, load_dataset, force=False):
    pickle_file_name = "pickle/" + file_name + ".pickle"
    import os
    if not os.path.exists("pickle"):
        os.makedirs("pickle")
        
    if os.path.exists(pickle_file_name) and not force:
        # You may override by setting force=True.
        print('%s already present - Skipping pickling.' % pickle_file_name)
    else:
        print('Pickling %s.' % pickle_file_name)
        dataset = load_dataset(None)
        try:
            with open(pickle_file_name, 'wb') as f:
                pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
        except Exception as e:
            print('Unable to save data to', file_name, ':', e)
    
    return pickle_file_name

def load_data(file_name, force=False):
    original_file_path = "../input/" + file_name + ".csv"
    pickle_file_name = maybe_pickle(file_name, lambda x: pd.read_csv(original_file_path), force)
    
    with open(pickle_file_name, 'rb') as f:
        return pickle.load(f)

In [2]:
biology = load_data("biology")
cooking = load_data("cooking")
crypto = load_data("crypto")
diy = load_data("diy")
robotics = load_data("robotics")
travel = load_data("travel")

pickle/biology.pickle already present - Skipping pickling.
pickle/cooking.pickle already present - Skipping pickling.
pickle/crypto.pickle already present - Skipping pickling.
pickle/diy.pickle already present - Skipping pickling.
pickle/robotics.pickle already present - Skipping pickling.
pickle/travel.pickle already present - Skipping pickling.


In [3]:
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
stemmer = SnowballStemmer("english")
wordnet_lemmatizer = WordNetLemmatizer()
punctuation_trans_table = str.maketrans({key: None for key in string.punctuation})
html_tag_regex = re.compile('<.*?>')

def cleaning_text(text):
    # TODO remove code fragment
    # TODO remove url
    # TODO convert to lowercase
    # TODO add meta features from original text
    ## length of the raw text in chars
    ## number of code segments
    ## number of 'a href' tags
    ## number of times 'http' occurs (count urls)
    ## number of times 'grater sign' occurs (count html tags)
    # TODO add meta features from cleaned text
    ## number of words(tokens) in the clean text
    ## length of the clean text in chars
    # TODO feature scaling(0-1 range) with min-max
    
    # remove html tags
    text = re.sub(html_tag_regex, '', text)
    # remove \r, \n
    text = text.replace('\n', ' ').replace('\r', '')
    # remove Punctuations
    text = text.translate(punctuation_trans_table)
    # split
    words = word_tokenize(text)
    # remove stop words
    words = [word for word in words if word not in stopwords.words('english')]
    # lemmatizing, stemming
    words = [wordnet_lemmatizer.lemmatize(word) for word in words]
    words = [stemmer.stem(word) for word in words]
    # join
    text = ' '.join(words)
    return text


def cleaning(row):
    row['title'] = cleaning_text(row['title'])
    row['content'] = cleaning_text(row['content'])
    return row

In [4]:
def load_cleaned_df(file_name, force=False):
    original_file_path = "../input/" + file_name + ".csv"
    df = pd.read_csv(original_file_path)
    print("total len : %d" % len(df))
    return df.progress_apply(cleaning, axis=1)
    
def maybe_pickle_cleaned_df(file_name, force=False):
    pickle_file_name = maybe_pickle(file_name + "_cleaned", lambda x: load_cleaned_df(file_name), force)
    
    with open(pickle_file_name, 'rb') as f:
        return pickle.load(f)

In [6]:
biology_cleaned_df = maybe_pickle_cleaned_df('biology')
cooking_cleaned_df = maybe_pickle_cleaned_df('cooking')
crypto_cleaned_df = maybe_pickle_cleaned_df('crypto')
diy_cleaned_df = maybe_pickle_cleaned_df('diy')
robotics_cleaned_df = maybe_pickle_cleaned_df('robotics')
travel_cleaned_df = maybe_pickle_cleaned_df('travel')

pickle/biology_cleaned.pickle already present - Skipping pickling.
pickle/cooking_cleaned.pickle already present - Skipping pickling.
pickle/crypto_cleaned.pickle already present - Skipping pickling.
pickle/diy_cleaned.pickle already present - Skipping pickling.
pickle/robotics_cleaned.pickle already present - Skipping pickling.
pickle/travel_cleaned.pickle already present - Skipping pickling.


In [9]:
from sklearn.feature_extraction import text
stop_words = text.ENGLISH_STOP_WORDS

from sklearn.feature_extraction.text import TfidfVectorizer
biology_vectorizer = TfidfVectorizer(stop_words=stop_words)
biology_content_vectors = biology_vectorizer.fit_transform(biology_cleaned_df['content'].tolist())
print(biology_content_vectors)

  (0, 12523)	0.120612338712
  (0, 25023)	0.157488359995
  (0, 6369)	0.133603543064
  (0, 6090)	0.348485775607
  (0, 3445)	0.32400093874
  (0, 33104)	0.155603571764
  (0, 18675)	0.195261252058
  (0, 9339)	0.195943328919
  (0, 32441)	0.267732578259
  (0, 29433)	0.124191996023
  (0, 31584)	0.148569968154
  (0, 6780)	0.156742573526
  (0, 29886)	0.201563513586
  (0, 21244)	0.161736628034
  (0, 12532)	0.173535276741
  (0, 10235)	0.216224552536
  (0, 34574)	0.545545939178
  (0, 28049)	0.201563513586
  (1, 34252)	0.220712540775
  (1, 27063)	0.266245082469
  (1, 14152)	0.210234920737
  (1, 36362)	0.146729302443
  (1, 14817)	0.181685382537
  (1, 34630)	0.198896388388
  (1, 11134)	0.318954513452
  :	:
  (13195, 22807)	0.141562555416
  (13195, 30835)	0.223224543698
  (13195, 36678)	0.122803495835
  (13195, 7344)	0.113705734928
  (13195, 14054)	0.122182509878
  (13195, 29537)	0.0930735408987
  (13195, 23854)	0.111943840168
  (13195, 34368)	0.0979120197791
  (13195, 26845)	0.115670570417
  (13195, 1

In [1]:
# TODO extract most common tags
tags_list = cleaned_df['tags'].str.split(pat=' ').tolist()
total_tags = pd.Series([item for sublist in tags_list for item in sublist])
print(len(total_tags))
print(total_tags.value_counts())

NameError: name 'cleaned_df' is not defined

In [7]:
# TODO train SGD classifier with one-vs-rest approach
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(loss="modified_huber")