In [2]:
# %load helper.py
import pandas as pd
import numpy as np
import sys
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
import gensim
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm
import re
import nltk
import string
from nltk.stem import WordNetLemmatizer
import unicodedata
# English stop-word list from the NLTK corpus; consulted by remove_stopwords().
stopword_list = nltk.corpus.stopwords.words('english')
# Shared WordNet lemmatizer instance; used by lemmatize_text().
wnl = WordNetLemmatizer()


# Lookup table from an English contraction to its expanded form.
# expand_contractions() matches keys case-insensitively and falls back to the
# lower-cased key, which is why both "I'..." and "i'..." spellings are listed.
CONTRACTION_MAP = {
"ain't": "is not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"I'd": "I would",
"I'd've": "I would have",
"I'll": "I will",
"I'll've": "I will have",
"I'm": "I am",
"I've": "I have",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you would",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have"
}

def strip_accents(s):
    """Return ``s`` without its combining accent marks.

    The text is first decomposed (Unicode NFD) so that every accent becomes a
    separate code point; code points whose category is ``Mn`` are then dropped.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept_chars = []
    for char in decomposed:
        if unicodedata.category(char) == 'Mn':
            continue
        kept_chars.append(char)
    return ''.join(kept_chars)


def tokenize_text(text):
    """Split ``text`` into whitespace-stripped NLTK word tokens.

    Text (unicode) input has its accents removed via ``strip_accents``.  Any
    character that still is not ASCII is then dropped before the string is
    handed to ``nltk.word_tokenize``.

    Works on both Python 2 and Python 3: the original referenced the
    Python 2-only ``unicode`` builtin and, on Python 3, would have passed the
    ``bytes`` produced by ``encode`` to the tokenizer.

    :param text: the sentence or document to tokenize.
    :return: list of tokens.
    """
    try:
        text_type = unicode  # Python 2: text strings are ``unicode``
    except NameError:
        text_type = str      # Python 3: ``str`` is already the text type

    if isinstance(text, text_type):
        text = strip_accents(text)
    # Drop whatever cannot be represented in ASCII.
    text = text.encode('ascii', 'ignore')
    if not isinstance(text, str):
        # Python 3 only: ``encode`` produced bytes; turn them back into text.
        # On Python 2 the encoded value already is a ``str`` and is left as is.
        text = text.decode('ascii')
    tokens = nltk.word_tokenize(text)
    return [token.strip() for token in tokens]


def expand_contractions(text, contraction_mapping):
    """Replace every contraction found in ``text`` by its expansion.

    Keys of ``contraction_mapping`` are matched case-insensitively.  The first
    character of the matched text is kept so that capitalisation survives
    ("Can't" -> "Cannot").  Apostrophes that remain afterwards are removed.

    :param text: input string.
    :param contraction_mapping: dict mapping contraction -> expanded form.
    :return: the expanded string without apostrophes.
    """
    if not contraction_mapping:
        # An empty alternation would match the empty string everywhere and
        # crash in expand_match; with nothing to expand only the apostrophe
        # clean-up applies.
        return re.sub("'", "", text)

    # Longest keys first: regex alternation takes the first alternative that
    # matches, so "he'll" must not be tried before "he'll've".
    ordered_keys = sorted(contraction_mapping.keys(), key=len, reverse=True)
    contractions_pattern = re.compile(
        '({})'.format('|'.join(re.escape(key) for key in ordered_keys)),
        flags=re.IGNORECASE | re.DOTALL)

    def expand_match(contraction):
        match = contraction.group(0)
        # Prefer the exact spelling, then fall back to the lower-cased key.
        expanded_contraction = contraction_mapping.get(match) \
            or contraction_mapping.get(match.lower())
        if expanded_contraction is None:
            # Matched only through IGNORECASE against a mixed-case key that
            # has no lower-case twin: leave the text untouched.
            return match
        return match[0] + expanded_contraction[1:]

    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text


from pattern.en import tag
from nltk.corpus import wordnet as wn


# Annotate text tokens with POS tags
def pos_tag_text(text):
    """Tag ``text`` with ``tag`` and return ``(lower-cased word, WordNet POS)`` pairs.

    The WordNet part of speech is chosen from the first letter of the tag
    (J -> adjective, V -> verb, N -> noun, R -> adverb); any other tag maps to
    ``None``.
    """
    def penn_to_wn_tags(pos_tag):
        # Guard-clause form: the first matching prefix wins.
        if pos_tag.startswith('J'):
            return wn.ADJ
        if pos_tag.startswith('V'):
            return wn.VERB
        if pos_tag.startswith('N'):
            return wn.NOUN
        if pos_tag.startswith('R'):
            return wn.ADV
        return None

    tagged_lower_text = []
    for word, pos_tag in tag(text):
        tagged_lower_text.append((word.lower(), penn_to_wn_tags(pos_tag)))
    return tagged_lower_text


# lemmatize text based on POS tags
def lemmatize_text(text):
    """Return ``text`` with every word replaced by its WordNet lemma.

    Words are tagged by ``pos_tag_text``; a word whose tag has no WordNet
    equivalent is kept unchanged.  The result is a single space-joined string.
    """
    lemmas = []
    for word, pos_tag in pos_tag_text(text):
        if pos_tag:
            lemmas.append(wnl.lemmatize(word, pos_tag))
        else:
            lemmas.append(word)
    return ' '.join(lemmas)


def remove_special_characters(text):
    """Strip digits and punctuation from ``text``.

    Digit runs are deleted first; the remainder is tokenized, every character
    of ``string.punctuation`` is removed from each token, and tokens that end
    up empty are discarded.

    :param text: input string.
    :return: the surviving tokens joined by single spaces.
    """
    # Raw string: "\d" inside a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    text = re.sub(r"\d+", "", text)

    tokens = tokenize_text(text)
    pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
    stripped_tokens = (pattern.sub('', token) for token in tokens)
    return ' '.join(token for token in stripped_tokens if token)


def remove_stopwords(text):
    """Tokenize ``text`` and drop every token listed in ``stopword_list``.

    Returns the remaining tokens joined by single spaces.
    """
    kept_tokens = []
    for token in tokenize_text(text):
        if token in stopword_list:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)


def normalize_corpus(corpus, tokenize=False):
    """Run the full cleaning chain over every document of ``corpus``.

    Steps, in order: contraction expansion, lemmatization, removal of digits
    and punctuation, stop-word removal.  Progress is shown through ``tqdm``.

    :param corpus: iterable of raw text documents.
    :param tokenize: when true each document is returned as a token list,
        otherwise as a cleaned string.
    :return: list with one entry per input document.
    """
    normalized_corpus = []
    for text in tqdm(corpus):
        cleaned = expand_contractions(text, CONTRACTION_MAP)
        cleaned = lemmatize_text(cleaned)
        cleaned = remove_special_characters(cleaned)
        cleaned = remove_stopwords(cleaned)
        normalized_corpus.append(tokenize_text(cleaned) if tokenize else cleaned)
    return normalized_corpus


def get_topn_tags_transform(path,topn,tag):
    """
    Read the cleaned CSV at ``path`` and build a binary target for ``tag``.

    1. expand every question into one row per tag
    2. keep only rows whose tag is among the ``topn`` most frequent tags
    3. flag rows whose tag equals ``tag`` (1) or not (0)
    4. aggregate the flags per question content (sum)

    :param path: CSV file without header, quoted with ``|``, whose three
        columns are title, body and tags (tags look like ``<a><b>``).
    :param topn: number of most frequent tags to keep.
    :param tag: the tag the binary column ``is_<tag>`` is built for.
    :return: ``(rslt, top_n_tags)`` where ``rslt`` has the columns
        ``content`` and ``is_<tag>`` and ``top_n_tags`` lists the kept tags,
        most frequent first.
    """
    df = pd.read_csv(path,quotechar='|',sep=',',header=None)
    df.columns = ['title','body','tags']
    contents = [title + ' ' + body for title, body in zip(df.title, df.body)]
    # "<a><b>" -> ['a', 'b']
    tag_lists = [raw_tags.replace('<','').split('>')[:-1] for raw_tags in df.tags]

    # One (content, tag) row per tag carried by a question.
    rows = [(content, single_tag)
            for content, row_tags in zip(contents, tag_lists)
            for single_tag in row_tags]
    df_transformed = pd.DataFrame(rows, columns=['content','tags'])

    # The loop variables below are deliberately NOT called ``tag``: on
    # Python 2 list-comprehension variables leak into the function scope and
    # would overwrite the ``tag`` argument before it is compared against.
    top_n_tags = [tag_name for tag_name, _count
                  in Counter(df_transformed.tags).most_common(topn)]

    target_col = 'is_{}'.format(tag)
    df_filtered = df_transformed[df_transformed.tags.isin(top_n_tags)]
    # Build a fresh frame instead of assigning into the filtered slice.
    flagged = pd.DataFrame({'content': df_filtered.content,
                            target_col: (df_filtered.tags == tag).astype(int)})
    rslt = flagged.groupby('content')[target_col].sum().reset_index()
    rslt.columns = ['content', target_col]
    return rslt, top_n_tags



class ExtractAverageWordVectors(BaseEstimator,TransformerMixin):
    """
    Represent each tokenized sentence by the plain average of its word vectors.

    g_model : word-vector lookup supporting ``word in g_model`` and
        ``g_model[word]``.
    """
    def __init__(self,g_model):
        self.g_model = g_model

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Average the word vectors of every sentence in ``X``.

        :raises ValueError: when no (or an empty) word-vector model was given;
            previously this case silently returned ``None``.
        """
        if not self.g_model:
            raise ValueError('ExtractAverageWordVectors needs a non-empty g_model')
        return self.averaged_word_vectors(X)

    def averaged_word_vectors(self, tokenized_list):
        """Return one averaged vector per sentence of ``tokenized_list``.

        Words unknown to ``g_model`` are skipped.  A sentence without any
        known word yields a zero vector instead of the NaN scalar that a
        division by zero used to produce, so the result stays rectangular.

        :param tokenized_list: iterable of token lists.
        :return: ``numpy`` array with one row per sentence.
        """
        # Vector length for the zero fallback: taken from the model when it
        # exposes ``vector_size``, otherwise from the first averaged sentence.
        vector_size = getattr(self.g_model, 'vector_size', None)
        averaged = []

        for sentence in tqdm(tokenized_list):
            word_vecs = [self.g_model[word] for word in sentence if word in self.g_model]
            if word_vecs:
                sentence_vec = np.sum(np.array(word_vecs), axis = 0)/len(word_vecs)
                if vector_size is None:
                    vector_size = sentence_vec.shape[0]
            else:
                sentence_vec = None  # filled with zeros once the size is known
            averaged.append(sentence_vec)

        zero_vec = np.zeros(vector_size or 0)
        return np.array([zero_vec if vec is None else vec for vec in averaged])


class ExtractTfidfAveVec(BaseEstimator,TransformerMixin):
    """
    Represent each tokenized sentence by the IDF-weighted average of its
    word vectors.

    corpus  : iterable of raw text documents the TF-IDF vocabulary is fitted on.
    g_model : word-vector lookup supporting ``word in g_model`` and
        ``g_model[word]``.

    NOTE(review): a second class with the same name is defined later in this
    file (model_ppl cell) and shadows this one once that cell has run.
    """
    def __init__(self,corpus, g_model):
        self.corpus = corpus
        self.g_model = g_model

    def fit(self, X, y=None):
        """Nothing is learned here; the TF-IDF model is fitted inside transform."""
        return self

    def transform(self, X, y=None):
        """Return the weighted sentence vectors for the token lists in ``X``."""
        return self.ave_weighted_tfidf(X)

    def tfidf_extractor(self, ngram_range=(1,1)):
        """Fit a ``TfidfVectorizer`` on ``self.corpus``.

        :return: ``(fitted vectorizer, tf-idf matrix of the corpus)``.
        """
        # Real booleans instead of 1: recent scikit-learn releases validate
        # parameter types and reject integers for boolean options.
        tfidf_obj = TfidfVectorizer(min_df=3, strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}', ngram_range=ngram_range, use_idf=True, smooth_idf=True, sublinear_tf=True, stop_words='english')

        tfidf_features = tfidf_obj.fit_transform(self.corpus)

        return tfidf_obj, tfidf_features

    def tfidf_mapper(self, tfidf_obj, tfidf_features):
        """Map every vocabulary word to its corpus-level IDF weight.

        The previous implementation read the weights from row 0 of the tf-idf
        matrix only (i.e. from the first document) and treated the word stored
        in column 0 as missing.  A weight that is valid for the whole corpus
        is the vectorizer's ``idf_`` value, which is used instead.

        :param tfidf_obj: fitted ``TfidfVectorizer``.
        :param tfidf_features: unused; kept so existing callers keep working.
        :return: dict word -> weight.
        """
        idf = tfidf_obj.idf_
        return {word: idf[column] for word, column in tfidf_obj.vocabulary_.items()}

    def ave_weighted_tfidf(self, tokenized_list):
        """Return one weighted, averaged vector per sentence.

        Every known word vector is multiplied by the word's weight (1 for
        words outside the TF-IDF vocabulary); the sum is divided by the number
        of known words.  Sentences without any known word yield a zero vector
        instead of a NaN scalar.

        :param tokenized_list: iterable of token lists.
        :return: ``numpy`` array with one row per sentence.
        """
        self.tokenized_list = tokenized_list
        tfidf_obj, tfidf_features = self.tfidf_extractor()
        word_tfidf_map = self.tfidf_mapper(tfidf_obj, tfidf_features)

        # Vector length for the zero fallback: from the model when available,
        # otherwise from the first sentence that could be averaged.
        vector_size = getattr(self.g_model, 'vector_size', None)
        weighted_ave = []

        for sentence in tqdm(self.tokenized_list):
            word_vecs = [self.g_model[word] * self.word_in_word_tfidf_map(word, word_tfidf_map)
                         for word in sentence if word in self.g_model]
            if word_vecs:
                sentence_vec = np.sum(np.array(word_vecs), axis = 0)/len(word_vecs)
                if vector_size is None:
                    vector_size = sentence_vec.shape[0]
            else:
                sentence_vec = None  # filled with zeros once the size is known
            weighted_ave.append(sentence_vec)

        zero_vec = np.zeros(vector_size or 0)
        return np.array([zero_vec if vec is None else vec for vec in weighted_ave])

    def word_in_word_tfidf_map(self, word, word_tfidf_map):
        """Return the weight of ``word``, or 1 when the word has none.

        The lookup is done on the keys of the map; testing against
        ``.values()`` (as before) never matched, so every weight was 1.
        """
        return word_tfidf_map.get(word, 1)

In [4]:
# %load model_ppl.py
from helper import *
from sklearn.base import BaseEstimator,TransformerMixin
import pandas as pd
import itertools
from collections import Counter
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.pipeline import Pipeline
from helper import normalize_corpus
import gensim
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import RandomizedSearchCV,cross_val_score,StratifiedShuffleSplit,train_test_split
from sklearn.metrics import recall_score,accuracy_score,make_scorer
import numpy as np
from tqdm import tqdm
from gensim import models,corpora
from sklearn.svm import SVC
import xgboost as xgb



class ParsedDataTransformer(BaseEstimator,TransformerMixin):
    """
    Transform the parsed data frame (title, body, tags) into a top-N frame.

    Output columns:
      content : title and body joined by a space (text is not cleaned)
      tags    : list holding only those tags of the question that are in
                ``topn_tags``

    Questions without any top-N tag are dropped and the index is reset.
    """
    def __init__(self, topn_tags):
        self.topn_tags = topn_tags

    def fit(self,X,y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self,X,y=None):
        """Build the ``content`` / ``tags`` frame from the three-column ``X``.

        ``X`` itself is left untouched: the column names are set on a shallow
        copy, whereas the previous version renamed the caller's columns in
        place.
        """
        frame = X.copy(deep=False)
        frame.columns = ['title', 'body', 'tags']

        merged = [title + ' ' + body for title, body in zip(frame.title, frame.body)]
        # "<a><b>" -> ['a', 'b']
        data_tags = [raw_tags.replace('<','').split('>')[:-1] for raw_tags in frame.tags]

        new_targets = [[tag for tag in sample_tags if tag in self.topn_tags]
                       for sample_tags in data_tags]
        # Keep only questions that still carry at least one wanted tag.
        bool_index = [len(wanted) > 0 for wanted in new_targets]

        rslt = pd.DataFrame({'content': merged, 'tags': new_targets}).loc[bool_index, :]
        return rslt.reset_index(drop=True)


class DropRowWithMultipleTags(BaseEstimator,TransformerMixin):
    """
    Keep only the questions that carry exactly one top-N tag, which reduces
    the overlap between the classes.
    """
    def __init__(self):
        pass

    def fit(self,X,y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self,X,y=None):
        """Return the rows of ``X`` whose ``tags`` entry has length one."""
        single_tag_mask = [len(row_tags) == 1 for row_tags in X.tags]
        return X[single_tag_mask]



class ContentCleaner(BaseEstimator,TransformerMixin):
    """
    Clean the ``content`` column with the helper ``normalize_corpus``.
    """
    def __init__(self):
        pass

    def fit(self,X,y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self,X,y=None):
        """Return a frame with the cleaned ``content`` strings and the ``tags`` of ``X``."""
        # tokenize=False: every document comes back as one cleaned string
        cleaned_docs = normalize_corpus(X.content, tokenize=False)
        columns = {'content': cleaned_docs, 'tags': X.tags}
        return pd.DataFrame(columns)


class BOWVector(BaseEstimator,TransformerMixin):
    """
    Bag-of-words count vectors in which the counts of the tag words are
    multiplied by ``tag_weight``.

    topn_tags  : words whose counts are boosted.
    tag_weight : the boost factor (defaults to the former hard-coded 77).
    """
    def __init__(self,topn_tags, tag_weight=77):
        self.topn_tags = topn_tags
        self.tag_weight = tag_weight

    @staticmethod
    def _build_vocabulary(contents):
        """Map every distinct whitespace-separated word to a column index."""
        uniq_words = set()
        for sentence in contents:
            uniq_words.update(sentence.split())
        # Sorted, so the column order does not depend on set iteration order.
        return {word: i for i, word in enumerate(sorted(uniq_words))}

    def fit(self,X,y=None):
        """Learn the vocabulary from ``X.content`` so that later ``transform``
        calls (e.g. on a test split) produce the same columns."""
        self.word_to_int_ = self._build_vocabulary(X.content)
        return self

    def transform(self,X,y=None):
        """Return the count matrix of ``X.content`` (rows: documents).

        Words outside the fitted vocabulary are ignored.  When ``fit`` was
        never called the vocabulary is built from ``X`` itself, which matches
        the former behaviour.

        Fixes over the previous version: tags come from ``self.topn_tags``
        rather than a global variable, and the boost is applied once to each
        tag's own column instead of repeatedly to the sentence's last word.
        """
        word_to_int = getattr(self, 'word_to_int_', None)
        if word_to_int is None:
            word_to_int = self._build_vocabulary(X.content)

        tag_columns = [word_to_int[tag] for tag in self.topn_tags if tag in word_to_int]

        content_vecs = np.zeros((len(X.content), len(word_to_int)))
        for row, sentence in enumerate(X.content):
            for word in sentence.split():
                column = word_to_int.get(word)
                if column is not None:
                    content_vecs[row, column] += 1
            if tag_columns:
                content_vecs[row, tag_columns] *= self.tag_weight
        return content_vecs



class ExtractTfidfAveVec(BaseEstimator, TransformerMixin):
    """
    Train a word-vector model and a TF-IDF model on the fitted corpus and use
    them to turn every document into one weighted, averaged word vector.

    vec_len   : length of the word vectors.
    topn_tags : words (the main language tags) whose weight is boosted.
    tag_boost : boost factor for those words (defaults to the former
        hard-coded 100).
    """
    def __init__(self,vec_len,topn_tags, tag_boost=100):
        self.vec_len = vec_len
        self.topn_tags = topn_tags
        self.tag_boost = tag_boost

    def fit(self, X, y=None):
        """Store ``X.content`` as corpus and train Word2Vec on its split documents."""
        self.corpus = list(X.content.values.flatten())
        # NOTE(review): ``size=`` is the keyword of gensim releases before 4.0
        # (later renamed ``vector_size``) - confirm the installed version.
        self.g_model = gensim.models.Word2Vec([s.split() for s in self.corpus], size=self.vec_len, window=10, min_count=2, sample=1e-3)

        return self

    def transform(self, X, y=None):
        """Return the weighted sentence vectors for ``X.content``."""
        return self.ave_weighted_tfidf(X.content.values.flatten())

    def tfidf_extractor(self, ngram_range=(1, 1)):
        """Fit a ``TfidfVectorizer`` on ``self.corpus``.

        :return: ``(fitted vectorizer, tf-idf matrix of the corpus)``.
        """
        # Real booleans instead of 1: recent scikit-learn releases validate
        # parameter types and reject integers for boolean options.
        tfidf_obj = TfidfVectorizer(min_df=3, strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
                                    ngram_range=ngram_range, use_idf=True, smooth_idf=True, sublinear_tf=True,
                                    stop_words='english')

        tfidf_features = tfidf_obj.fit_transform(self.corpus)

        return tfidf_obj, tfidf_features

    def tfidf_mapper(self, tfidf_obj, tfidf_features):
        """Map every vocabulary word to its weight and boost the tag words.

        The previous implementation read the weights from row 0 of the tf-idf
        matrix only (the first document) and treated the word in column 0 as
        missing; the vectorizer's corpus-level ``idf_`` value is used instead.
        A tag that is absent from the vocabulary starts from the neutral
        weight 1 - the former default, a zero vector, wiped the tag's word
        vector out instead of boosting it.

        :param tfidf_obj: fitted ``TfidfVectorizer``.
        :param tfidf_features: unused; kept so existing callers keep working.
        :return: dict word -> weight.
        """
        idf = tfidf_obj.idf_
        word_tfidf_map = {word: idf[column] for word, column in tfidf_obj.vocabulary_.items()}

        # add more main language weight
        for tag in self.topn_tags:
            word_tfidf_map[tag] = self.tag_boost * word_tfidf_map.get(tag, 1)
        return word_tfidf_map

    def ave_weighted_tfidf(self, tokenized_list):
        """Return one weighted, averaged vector per document.

        Each document string is split on whitespace; every word known to the
        word-vector model contributes its vector times its weight, and the
        sum is divided by the number of known words.  Documents without any
        known word yield a zero vector of length ``vec_len`` instead of a NaN
        scalar.

        :param tokenized_list: iterable of document strings.
        :return: ``numpy`` array with one row per document.
        """
        self.tokenized_list = tokenized_list
        tfidf_obj, tfidf_features = self.tfidf_extractor()
        word_tfidf_map = self.tfidf_mapper(tfidf_obj, tfidf_features)

        zero_vec = np.zeros(self.vec_len)
        weighted_ave = []

        for sentence in tqdm(self.tokenized_list):
            word_vecs = [self.g_model[word] * self.word_in_word_tfidf_map(word, word_tfidf_map)
                         for word in sentence.split() if word in self.g_model]
            if word_vecs:
                weighted_ave.append(np.sum(np.array(word_vecs), axis=0) / len(word_vecs))
            else:
                weighted_ave.append(zero_vec)

        return np.array(weighted_ave)

    def word_in_word_tfidf_map(self, word, word_tfidf_map):
        """Return the weight of ``word``, or 1 when the word has none.

        The lookup is done on the keys of the map; testing against
        ``.values()`` (as before) never matched a word.
        """
        return word_tfidf_map.get(word, 1)


class LDATransformer(BaseEstimator, TransformerMixin):
    """
    Replace the text of every document by its LDA topic distribution.

    num_topics : number of LDA topics (= number of output topic columns).
    passes     : number of training passes handed to the LDA model.
    """
    def __init__(self, num_topics, passes):
        self.num_topics = num_topics
        self.passes = passes

    def fit(self, X, y=None):
        """Nothing is learned here; the LDA model is trained inside transform."""
        return self

    def transform(self, X, y=None):
        """Train LDA on ``X.content`` and return topic weights plus the second
        column of ``X``.

        Fixes over the previous version:
        * ``print`` is called as a function, so the code also parses on Python 3;
        * the number of topic columns follows ``num_topics`` instead of a
          hard-coded 10;
        * the topic frame reuses the index of ``X`` so the column-wise concat
          lines up even when that index is not 0..n-1.

        :return: DataFrame with the topic columns ``0 .. num_topics-1``
            followed by ``X.iloc[:, 1]``.
        """
        texts = X.content.apply(lambda x: x.split())
        dictionary = corpora.Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]
        print('training LDA...')
        ldamodel = models.ldamodel.LdaModel(
            corpus, num_topics=self.num_topics, id2word=dictionary, passes=self.passes)

        # Topics missing from a document's result keep the weight 0.
        topic_weights = np.zeros((len(corpus), self.num_topics))
        for row, sample in enumerate(ldamodel.get_document_topics(corpus)):
            for topic_id, value in sample:
                topic_weights[row, topic_id] = value

        topics_df = pd.DataFrame(topic_weights, index=X.index)
        return pd.concat((topics_df, X.iloc[:, 1]), axis=1)


def xgb_random_search(x_train,y_train,num_iter=5):
    """Randomized hyper-parameter search for a one-vs-rest XGBoost classifier.

    Candidates are scored by macro-averaged recall with 5-fold cross
    validation.  The best mean score and its parameters are printed.

    :param x_train: training features.
    :param y_train: training targets.
    :param num_iter: number of parameter settings that are sampled.
    :return: ``(highest mean test score, parameters of that candidate)``.
    """
    def score_func(y, y_pred, **kwargs):
        return recall_score(y,y_pred,average='macro')

    search_space = {
        'estimator__max_depth': [3, 4],
        'estimator__learning_rate': [0.01, 0.05, 0.1, 0.5],
        'estimator__n_estimators': list(range(100,1000,100)),
        'estimator__colsample_bytree': [0.5,0.7,0.9],
    }
    one_vs_rest = OneVsRestClassifier(xgb.XGBClassifier(),n_jobs=-1)
    macro_recall = make_scorer(score_func)

    searcher = RandomizedSearchCV(one_vs_rest, search_space, n_iter=num_iter, cv=5,scoring=macro_recall)
    searcher.fit(x_train, y_train)

    # Rank the candidates by mean test score, best first.
    results = searcher.cv_results_
    ranked = list(zip(results["mean_test_score"], results["params"]))
    ranked.sort(key=lambda pair: pair[0], reverse=True)
    highest_score, highest_paras = ranked[0]

    print('best score: {}, best paras:{}'.format(highest_score, highest_paras))
    return highest_score,highest_paras


def svm_random_search(x_train, y_train, num_iter=10):
    """Randomized hyper-parameter search for a one-vs-rest SVM classifier.

    Candidates are scored by macro-averaged recall with 5-fold cross
    validation.  The best mean score and its parameters are printed.

    :param x_train: training features.
    :param y_train: training targets.
    :param num_iter: number of parameter settings that are sampled.
    :return: ``(highest mean test score, parameters of that candidate)``.
    """
    def score_func(y, y_pred, **kwargs):
        return recall_score(y,y_pred,average='macro')

    search_space = {
        'estimator__kernel':[ 'linear', 'poly', 'rbf', 'sigmoid'],
        'estimator__C' : list(np.linspace(0.01,1,20)),
        "estimator__degree": [1, 2, 3, 4],
        "estimator__class_weight":['balanced'],
        "estimator__probability":[True]
    }
    one_vs_rest = OneVsRestClassifier(SVC(),n_jobs=-1)
    macro_recall = make_scorer(score_func)

    searcher = RandomizedSearchCV(one_vs_rest, search_space, n_iter=num_iter, cv=5,scoring=macro_recall)
    searcher.fit(x_train, y_train)

    # Rank the candidates by mean test score, best first.
    results = searcher.cv_results_
    ranked = list(zip(results["mean_test_score"], results["params"]))
    ranked.sort(key=lambda pair: pair[0], reverse=True)
    highest_score, highest_paras = ranked[0]

    print('best score: {}, best paras:{}'.format(highest_score, highest_paras))
    return highest_score,highest_paras


class TargetBinerizer(BaseEstimator,TransformerMixin):
    """
    Turn the ``tags`` column into a one-hot like indicator matrix.
    """
    def __init__(self):
        pass

    def fit(self,X,y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self,X,y=None):
        """Fit a ``MultiLabelBinarizer`` on ``X.tags`` and encode them.

        :return: ``(indicator matrix, fitted binarizer)`` - note that this is
            a tuple, not a single array.
        """
        mlb = MultiLabelBinarizer()
        mlb.fit(X.tags)
        return mlb.transform(X.tags), mlb

def bool_to_int(array):
    """Return a list holding ``int(b)`` for every element ``b`` of ``array``."""
    return [int(b) for b in array]

# Script entry point: load the parsed questions, build features and targets
# with two pipelines and split them into train and test sets.
if __name__ == '__main__':

    # Header-less CSV quoted with '|'; ParsedDataTransformer names the columns
    # title / body / tags.
    # NOTE(review): the file name contains a space before '.csv' - confirm that
    # this matches the file on disk.
    df = pd.read_csv('./data/processed/stack_ds_4_9_2017 .csv', quotechar='|', sep=',', header=None )
    # Tags the classification is restricted to.
    topn_tags = ['javascript', 'java', 'android', 'php', 'python', 'c#', 'html', 'jquery', 'ios', 'css']


    # Feature pipeline: keep questions with exactly one wanted tag, then clean
    # their text. The vectorizing step is currently disabled.
    ppl_X = Pipeline([
        ('transformer',ParsedDataTransformer(topn_tags)),
        ('droprowwithgtonetag',DropRowWithMultipleTags()),
        ('textCleaner',ContentCleaner()),
        # ('doc2vecTFIDFtransformer',ExtractTfidfAveVec(vec_len=15,topn_tags=topn_tags)),
    ])

    # Target pipeline: the same row selection as above, followed by the
    # binarization of the remaining tags.
    ppl_y = Pipeline([
        ('transformer',ParsedDataTransformer(topn_tags)),
        ('droprowwithgtonetag', DropRowWithMultipleTags()),
        ('targetBinerizer',TargetBinerizer())
    ])


    X = ppl_X.fit_transform(df)
    # TargetBinerizer.transform returns a tuple: (indicator matrix, fitted binarizer).
    y,mlb = ppl_y.fit_transform(df)

    # 80/20 split with a fixed seed so the split is reproducible.
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 123)

100%|██████████| 1961/1961 [00:48<00:00, 40.42it/s]


In [21]:
from sklearn.feature_extraction.text import CountVectorizer

In [11]:
# Rebind `df` to the training split produced by train_test_split above; the
# following cells explore word frequencies on it.
df = x_train

In [170]:
# Binarize the tags of the working frame; this rebinds `mlb` and `y`.
mlb = MultiLabelBinarizer().fit(df.tags)
y = mlb.transform(df.tags)
# Split the documents on the LAST label column: `mask_is` marks documents that
# carry that label, `mask_not` all the others.
mask_is = [bool(x) for x in y[:,-1]]
mask_not = [ not bool(x) for x in y[:,-1]]
# Word counts over the documents WITHOUT the label (words must occur in at
# least 3 of those documents).
ct_not = CountVectorizer(min_df=3).fit(df.content[mask_not])
x_not = ct_not.transform(df.content[mask_not])
features_not = ct_not.get_feature_names()

# Same for the documents WITH the label.
ct_is = CountVectorizer(min_df=3).fit(df.content[mask_is])
x_is = ct_is.transform(df.content[mask_is])
features_is = ct_is.get_feature_names()

# word -> total number of occurrences within the respective document group
mapper_is = {w:v for w,v in zip(features_is,np.array(x_is.todense()).sum(axis=0))}
mapper_not = {w:v for w,v in zip(features_not,np.array(x_not.todense()).sum(axis=0))}

In [171]:
# Show the label order of the binarizer; its last entry is the label that
# `y[:,-1]` selects.
mlb.classes_

array(['android', 'c#', 'css', 'html', 'ios', 'java', 'javascript',
       'jquery', 'php', 'python'], dtype=object)

In [172]:
# For every word present in BOTH vocabularies compute
#   log( (count_is / count_not) * (n_docs_not / n_docs_is) )
# and keep the words whose log-ratio is at least 1.
ratios = []
# Despite its name, `union_word` iterates over the intersection (&) of the two
# vocabularies.
for union_word in set(features_is) & set(features_not):
    ratio = float(mapper_is[union_word])/float(mapper_not[union_word]) * x_not.shape[0] / x_is.shape[0]
    ratio = np.log(ratio)
    if ratio >= 1:
        ratios.append((ratio,union_word))

In [173]:
# Order the (log-ratio, word) pairs by log-ratio, largest first.
ratios = sorted(ratios,key=lambda x:x[0],reverse=True)

In [174]:
# Display the ten words with the highest log-ratio.
ratios[:10]

[(6.095574280864156, u'python'),
 (3.2215389422876846, u'csv'),
 (2.8315780207154853, u'typeerror'),
 (2.6645239360523192, u'shell'),
 (2.2407096892759584, u'matrix'),
 (2.212538812309262, u'relative'),
 (2.1637486481398298, u'pointer'),
 (2.0583881324820035, u'layer'),
 (1.9406050968256201, u'ps'),
 (1.9042374526547454, u'corresponding')]