In [1]:
import sys
sys.path.append("..")
from common.download_utils import download_week1_resources

download_week1_resources()

File data/train.tsv is already downloaded.
File data/validation.tsv is already downloaded.
File data/test.tsv is already downloaded.
File data/text_prepare_tests.tsv is already downloaded.


### Text preprocessing

In [4]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/kolade/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
from ast import literal_eval
import pandas as pd
import numpy as np

In [6]:
def read_data(filename):
    data = pd.read_csv(filename, sep='\t')
    data['tags'] = data['tags'].apply(literal_eval)
    return data

In [7]:
train = read_data('data/train.tsv')
validation = read_data('data/validation.tsv')
test = pd.read_csv('data/test.tsv', sep='\t')

For a more comfortable usage, initialize *X_train*, *X_val*, *X_test*, *y_train*, *y_val*.

In [8]:
X_train  = train['title'].values
X_val = validation['title'].values
X_test = test['title'].values

In [9]:
y_train = train['tags'].values
y_val  =  validation['tags'].values

In [10]:
import re 

In [11]:

REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def text_prepare(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = text.lower()# lowercase text
    text = re.sub(REPLACE_BY_SPACE_RE," ", text) 
    text= re.sub(BAD_SYMBOLS_RE,"",text)
    text = " ".join(word for word in text.split() if word not in STOPWORDS)
    return text

In [12]:
def test_text_prepare():
    examples = ["SQL Server - any equivalent of Excel's CHOOSE function?",
                "How to free c++ memory vector<int> * arr?"]
    answers = ["sql server equivalent excels choose function", 
               "free c++ memory vectorint arr"]
    for ex, ans in zip(examples, answers):
        if text_prepare(ex) != ans:
            return "Wrong answer for the case: '%s'" % ex
    return 'Basic tests are passed.'

In [13]:
print(test_text_prepare())

Basic tests are passed.


In [15]:
X_train = [text_prepare(x) for x in X_train]
X_val = [text_prepare(x) for x in X_val]
X_test = [text_prepare(x) for x in X_test]

In [18]:
# Dictionary of all tags from train corpus with their counts.
# Dictionary of all words from train corpus with their counts.


def wordTagCount(words_data, tags_data):
    word_list = defaultdict(int)
    tag_list = defaultdict(int)
    for words in words_data:
        for word in words.split():
                word_list[word] +=1
    for tags in tags_data:
        for tag in tags:
                tag_list[tag] += 1
                
    return dict(word_list), dict(tag_list)


words_counts, tags_counts = wordTagCount(X_train, y_train)

We are assuming that *tags_counts* and *words_counts* are dictionaries like `{'some_word_or_tag': frequency}`. After applying the sorting procedure, results will be look like this: `[('most_popular_word_or_tag', frequency), ('less_popular_word_or_tag', frequency), ...]`. The grader gets the results in the following format (two comma-separated strings with line break):

    tag1,tag2,tag3
    word1,word2,word3

Pay attention that in this assignment you should not submit frequencies or some additional information.

In [19]:
most_common_tags = sorted(tags_counts.items(), key=lambda x: x[1], reverse=True)[:3]
most_common_words = sorted(words_counts.items(), key=lambda x: x[1], reverse=True)[:3]



Current answer for task WordsTagsCount is:
 javascript,c#,java
using,php,java...


### Transforming text to a vector


#### Bag of words



In [20]:
a =['game','plan']
b = range(len(a))
c = dict(zip(a,b))

In [21]:

DICT_SIZE = 5000
most_common_word_index_pair = sorted(words_counts.items(), key=lambda x: x[1], reverse=True)[:DICT_SIZE]
most_common= [word for word, i in most_common_word_index_pair]
range_most_common= range(len(most_common))
WORDS_TO_INDEX = dict (zip(most_common,range_most_common))
INDEX_TO_WORDS = dict(enumerate(most_common))
ALL_WORDS = WORDS_TO_INDEX.keys()

def my_bag_of_words(text, words_to_index, dict_size):
    """
        text: a string
        dict_size: size of the dictionary
        
        return a vector which is a bag-of-words representation of 'text'
    """
    result_vector = np.zeros(dict_size)
    for word in text.split():
            for word2, index in (words_to_index.items()):
                if word == word2:
                    result_vector[index]+=1
    
    return result_vector

In [22]:
def test_my_bag_of_words():
    words_to_index = {'hi': 0, 'you': 1, 'me': 2, 'are': 3}
    examples = ['hi how are you']
    answers = [[1, 1, 0, 1]]
    for ex, ans in zip(examples, answers):
        if (my_bag_of_words(ex, words_to_index, 4) != ans).any():
            return "Wrong answer for the case: '%s'" % ex
    return 'Basic tests are passed.'

In [23]:
print(test_my_bag_of_words())

Basic tests are passed.


In [24]:
from scipy import sparse as sp_sparse

In [25]:
X_train_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_train])
X_val_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_val])
X_test_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_test])
print('X_train shape ', X_train_mybag.shape)
print('X_val shape ', X_val_mybag.shape)
print('X_test shape ', X_test_mybag.shape)

X_train shape  (100000, 5000)
X_val shape  (30000, 5000)
X_test shape  (20000, 5000)


In [26]:
X_train_mybag[10].toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

In [27]:
row = X_train_mybag[10].toarray()
non_zero_elements_count = np.count_nonzero(row)


Current answer for task BagOfWords is:
 7...


#### TF-IDF



In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [29]:
def tfidf_features(X_train, X_val, X_test):
    """
        X_train, X_val, X_test — samples        
        return TF-IDF vectorized representation of each sample and vocabulary
    """
    # Create TF-IDF vectorizer with a proper parameters choice
    # Fit the vectorizer on the train set
    # Transform the train, test, and val sets and return the result
    
    from sklearn.feature_extraction.text import TfidfVectorizer
    tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=5, max_df=0.9, token_pattern='(\S+)')
    X_train = tfidf_vectorizer.fit_transform(X_train)
    X_test = tfidf_vectorizer.transform(X_test)
    X_val = tfidf_vectorizer.transform(X_val)
    return X_train, X_val, X_test, tfidf_vectorizer.vocabulary_

In [30]:
X_train_tfidf, X_val_tfidf, X_test_tfidf, tfidf_vocab = tfidf_features(X_train, X_val, X_test)
tfidf_reversed_vocab = {i:word for word,i in tfidf_vocab.items()}

### MultiLabel classifier


In [31]:
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder,LabelBinarizer

In [32]:
mlb = MultiLabelBinarizer(classes=sorted(tags_counts.keys()),sparse_output=True)
y_train = mlb.fit_transform(y_train)
y_val = mlb.transform(y_val)

In [34]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier,SGDClassifier
from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.problem_transform import ClassifierChain


Train the classifiers for different data transformations: *bag-of-words* and *tf-idf*.

In [103]:
classifier_mybag = train_classifier(X_train_mybag, y_train)
classifier_tfidf = train_classifier(X_train_tfidf, y_train)

In [38]:
classifier_mybag = train_multi_classifier(X_train_mybag, y_train)
classifier_tfidf = train_multi_classifier(X_train_tfidf, y_train)

Now you can create predictions for the data. You will need two types of predictions: labels and scores.

In [41]:
y_val_predicted_labels_mybag = classifier_mybag.predict(X_val_mybag)
# y_val_predicted_scores_mybag = classifier_mybag.decision_function(X_val_mybag)

y_val_predicted_labels_tfidf = classifier_tfidf.predict(X_val_tfidf)
# y_val_predicted_scores_tfidf = classifier_tfidf.decision_function(X_val_tfidf)

Now take a look at how classifier, which uses TF-IDF, works for a few examples:

In [42]:
y_val_pred_inversed = mlb.inverse_transform(y_val_predicted_labels_tfidf)
y_val_inversed = mlb.inverse_transform(y_val)
for i in range(3):
    print('Title:\t{}\nTrue labels:\t{}\nPredicted labels:\t{}\n\n'.format(
        X_val[i],
        ','.join(y_val_inversed[i]),
        ','.join(y_val_pred_inversed[i])
    ))

Title:	odbc_exec always fail
True labels:	php,sql
Predicted labels:	


Title:	access base classes variable within child class
True labels:	javascript
Predicted labels:	


Title:	contenttype application json required rails
True labels:	ruby,ruby-on-rails
Predicted labels:	json,ruby-on-rails




### Evaluation



In [44]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score 
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score, classification_report

Implement the function *print_evaluation_scores* which calculates and prints to stdout:
 - *accuracy*
 - *F1-score macro/micro/weighted*
 - *Precision macro/micro/weighted*

In [45]:
def print_evaluation_scores(y_val, predicted):
    print('accracy: ',(accuracy_score(y_val, predicted)))
    print("F1 Score Weighted" ,(f1_score(y_val, predicted, average='weighted')))
    print("F1 Score micro",(f1_score(y_val, predicted, average='micro')))
    print("F1 Score Macro",(f1_score(y_val, predicted, average ='macro')))
    print(" Pre_ Micro",(average_precision_score(y_val, predicted,average='micro')))
    print( "Pre_ Macro", (average_precision_score(y_val, predicted,average='macro')))
    print("pre_weighted",(average_precision_score(y_val, predicted)))

In [62]:
y_val.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [46]:
print('Bag-of-words')
print_evaluation_scores(y_val.toarray(), y_val_predicted_labels_mybag.toarray())
print('Tfidf')
print_evaluation_scores(y_val.toarray(), y_val_predicted_labels_tfidf.toarray())

Bag-of-words
accracy:  0.367
F1 Score Weighted 0.6482776092038189
F1 Score micro 0.6755961760689148
F1 Score Macro 0.503050055351385
 Pre_ Micro 0.4905526148714524
Pre_ Macro 0.35574907030705466
pre_weighted 0.35574907030705466
Tfidf
accracy:  0.34973333333333334


  'precision', 'predicted', average, warn_for)


F1 Score Weighted 0.6287294393432716
F1 Score micro 0.6573939459694044
F1 Score Macro 0.4792702156015221
 Pre_ Micro 0.4754480800803637
Pre_ Macro 0.34068014929242885
pre_weighted 0.34068014929242885
