In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
%matplotlib inline
import nltk
from nltk.corpus import stopwords
from collections import Counter

import warnings
warnings.filterwarnings('ignore')

1) Load the dataset (5 points) 
    
    Tip: As the dataset is large, use fewer rows. 
    
    Check what is working well on your machine and decide accordingly.

In [2]:
corpus_df = pd.read_csv('blogtext.csv') ## import the dataset

In [3]:
corpus_df = corpus_df[0:10000] ## taking fist 10,000 entries

In [4]:
corpus_df.shape ## shape of the dataset

(10000, 7)

In [5]:
 corpus_df.columns ## attributes

Index(['id', 'gender', 'age', 'topic', 'sign', 'date', 'text'], dtype='object')

In [6]:
corpus_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 7 columns):
id        10000 non-null int64
gender    10000 non-null object
age       10000 non-null int64
topic     10000 non-null object
sign      10000 non-null object
date      10000 non-null object
text      10000 non-null object
dtypes: int64(2), object(5)
memory usage: 547.0+ KB


In [7]:
corpus_df.isna().sum()

id        0
gender    0
age       0
topic     0
sign      0
date      0
text      0
dtype: int64

In [8]:
 corpus_df.head(10)

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...
5,3581210,male,33,InvestmentBanking,Aquarius,"10,June,2004",I had an interesting conversation...
6,3581210,male,33,InvestmentBanking,Aquarius,"10,June,2004",Somehow Coca-Cola has a way of su...
7,3581210,male,33,InvestmentBanking,Aquarius,"10,June,2004","If anything, Korea is a country o..."
8,3581210,male,33,InvestmentBanking,Aquarius,"10,June,2004",Take a read of this news article ...
9,3581210,male,33,InvestmentBanking,Aquarius,"09,June,2004",I surf the English news sites a l...


2) Preprocess rows of the “text” column (7.5 points)

a. Remove unwanted characters

b. Convert text to lowercase

c. Remove unwanted spaces

d. Remove stopwords

In [9]:
def clean_text(text):
    # remove backslash-apostrophe 
    text = re.sub("\'", "", text) 
    # remove everything except alphabets 
    text = re.sub("[^a-zA-Z]"," ",text) 
    # remove whitespaces 
    text = ' '.join(text.split()) 
    # convert text to lowercase 
    text = text.lower() 
    
    return text

In [10]:
corpus_df['clean_text'] = corpus_df['text'].apply( lambda x: clean_text(x))

In [11]:
corpus_df.head()  ## after applying the cleaning function(removed special characters, spaces)

Unnamed: 0,id,gender,age,topic,sign,date,text,clean_text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,...",info has been found pages and mb of pdf files ...
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...,these are the team members drewes van der laag...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...,in het kader van kernfusie op aarde maak je ei...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!,testing testing
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...,thanks to yahoo s toolbar i can now capture th...


In [12]:
stop_words = set(stopwords.words('english'))

In [13]:
def remove_stopwords(text):          ## stop words removal functon
    no_stopword_text = [w for w in text.split() if w not in stop_words]
    return ' '.join(no_stopword_text)

In [14]:
corpus_df['clean_text'] = corpus_df['clean_text'].apply(lambda x: remove_stopwords(x))

In [15]:
corpus_df.head(10)  ## Dataset without stopwords

Unnamed: 0,id,gender,age,topic,sign,date,text,clean_text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,...",info found pages mb pdf files wait untill team...
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...,team members drewes van der laag urllink mail ...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...,het kader van kernfusie op aarde maak je eigen...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!,testing testing
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...,thanks yahoo toolbar capture urls popups means...
5,3581210,male,33,InvestmentBanking,Aquarius,"10,June,2004",I had an interesting conversation...,interesting conversation dad morning talking k...
6,3581210,male,33,InvestmentBanking,Aquarius,"10,June,2004",Somehow Coca-Cola has a way of su...,somehow coca cola way summing things well earl...
7,3581210,male,33,InvestmentBanking,Aquarius,"10,June,2004","If anything, Korea is a country o...",anything korea country extremes everything see...
8,3581210,male,33,InvestmentBanking,Aquarius,"10,June,2004",Take a read of this news article ...,take read news article urllink joongang ilbo n...
9,3581210,male,33,InvestmentBanking,Aquarius,"09,June,2004",I surf the English news sites a l...,surf english news sites lot looking tidbits ko...


3) As we want to make this into a multi-label classification problem, you are required to merge all the label columns together, so that we have all the labels together for a particular sentence (7.5 points)

a. Label columns to merge: “gender”, “age”, “topic”, “sign”

b. After completing the previous step, there should be only two columns in your data
frame i.e. “text” and “labels” as shown in the below image


In [16]:
data_df = pd.DataFrame(columns = ['text','labels'])
corpus_df['age'] = corpus_df['age'].astype(str)
data_df['labels'] = corpus_df[['gender','age','topic','sign']].apply(lambda x: ','.join(x), axis=1)
data_df['text'] = corpus_df['clean_text']
data_df.head(10)

Unnamed: 0,text,labels
0,info found pages mb pdf files wait untill team...,"male,15,Student,Leo"
1,team members drewes van der laag urllink mail ...,"male,15,Student,Leo"
2,het kader van kernfusie op aarde maak je eigen...,"male,15,Student,Leo"
3,testing testing,"male,15,Student,Leo"
4,thanks yahoo toolbar capture urls popups means...,"male,33,InvestmentBanking,Aquarius"
5,interesting conversation dad morning talking k...,"male,33,InvestmentBanking,Aquarius"
6,somehow coca cola way summing things well earl...,"male,33,InvestmentBanking,Aquarius"
7,anything korea country extremes everything see...,"male,33,InvestmentBanking,Aquarius"
8,take read news article urllink joongang ilbo n...,"male,33,InvestmentBanking,Aquarius"
9,surf english news sites lot looking tidbits ko...,"male,33,InvestmentBanking,Aquarius"


In [17]:
data_df['labels'].values

array(['male,15,Student,Leo', 'male,15,Student,Leo',
       'male,15,Student,Leo', ..., 'female,25,indUnk,Pisces',
       'female,25,indUnk,Pisces', 'female,25,indUnk,Pisces'], dtype=object)

4) Separate features and labels, and split the data into training and testing (5 points)

In [19]:
X = data_df['text']
y = data_df['labels'].str.lower()

In [55]:
y[500]

'female,27,education,aquarius'

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=2)

In [57]:
X_train.shape

(7000,)

In [61]:
y_train[0]

'male,15,student,leo'

In [23]:
X_test.shape

(3000,)

5) Vectorize the features (5 points)

    a. Create a Bag of Words using count vectorizer
        
        i. Use ngram_range=(1, 2)
        
        ii. Vectorize training and testing features
    
    b. Print the term-document matrix

In [68]:
#creating bag of words
vectori = CountVectorizer(ngram_range=(1,2),min_df=1)

In [69]:
#creating the document_term_matrix
X_train_dtm = vectori.fit_transform(X_train)
X_test_dtm = vectori.transform(X_test)

In [70]:
print(X_train_dtm)

  (0, 36614)	1
  (0, 439197)	1
  (0, 282589)	2
  (0, 445068)	1
  (0, 159751)	1
  (0, 207686)	1
  (0, 422049)	1
  (0, 161216)	1
  (0, 318036)	1
  (0, 381810)	2
  (0, 465771)	1
  (0, 207232)	2
  (0, 333320)	1
  (0, 57187)	1
  (0, 416768)	2
  (0, 16696)	2
  (0, 355345)	3
  (0, 343512)	1
  (0, 186002)	1
  (0, 401969)	1
  (0, 242475)	1
  (0, 378573)	1
  (0, 381279)	1
  (0, 462844)	1
  (0, 21795)	1
  :	:
  (6999, 270200)	1
  (6999, 308239)	1
  (6999, 60514)	1
  (6999, 184537)	1
  (6999, 286375)	1
  (6999, 303239)	1
  (6999, 188542)	1
  (6999, 145807)	1
  (6999, 303637)	1
  (6999, 463338)	1
  (6999, 334051)	1
  (6999, 318361)	1
  (6999, 21027)	1
  (6999, 418346)	1
  (6999, 166542)	1
  (6999, 426235)	1
  (6999, 166541)	1
  (6999, 182654)	1
  (6999, 248932)	1
  (6999, 377091)	1
  (6999, 83809)	1
  (6999, 17324)	1
  (6999, 54798)	1
  (6999, 207852)	1
  (6999, 156521)	1



6) Create a dictionary to get the count of every label i.e. the key will be label name and value will be the total count of the label. (5 points)

In [71]:
vectorizer_labels = CountVectorizer(min_df = 1,ngram_range = (1,1),stop_words = "english")
labels_vector = vectorizer_labels.fit_transform(y)
vectorizer_labels.vocabulary_

{'male': 49,
 '15': 2,
 'student': 64,
 'leo': 46,
 '33': 10,
 'investmentbanking': 43,
 'aquarius': 25,
 'female': 38,
 '14': 1,
 'indunk': 41,
 'aries': 26,
 '25': 7,
 'capricorn': 32,
 '17': 4,
 'gemini': 39,
 '23': 5,
 'non': 53,
 'profit': 55,
 'cancer': 31,
 'banking': 29,
 '37': 14,
 'sagittarius': 59,
 '26': 8,
 '24': 6,
 'scorpio': 61,
 '27': 9,
 'education': 35,
 '45': 22,
 'engineering': 36,
 'libra': 47,
 'science': 60,
 '34': 11,
 '41': 18,
 'communications': 33,
 'media': 51,
 'businessservices': 30,
 'sports': 63,
 'recreation': 57,
 'virgo': 68,
 'taurus': 65,
 'arts': 27,
 'pisces': 54,
 '44': 21,
 '16': 3,
 'internet': 42,
 'museums': 52,
 'libraries': 48,
 'accounting': 24,
 '39': 16,
 '35': 12,
 'technology': 66,
 '36': 13,
 'law': 44,
 '46': 23,
 'consulting': 34,
 'automotive': 28,
 '42': 19,
 'religion': 58,
 '13': 0,
 'fashion': 37,
 '38': 15,
 '43': 20,
 'publishing': 56,
 '40': 17,
 'marketing': 50,
 'lawenforcement': 45,
 'security': 62,
 'humanresources': 40

In [72]:
# Extracing only key value from above dictionary, which contains unique labels. These set of labels will be used as classes in multilabelbinariser further.

labeled_classes = []  
for key in vectorizer_labels.vocabulary_.keys():
    labeled_classes.append(key)
    
print(sorted(labeled_classes))

['13', '14', '15', '16', '17', '23', '24', '25', '26', '27', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', 'accounting', 'aquarius', 'aries', 'arts', 'automotive', 'banking', 'businessservices', 'cancer', 'capricorn', 'communications', 'consulting', 'education', 'engineering', 'fashion', 'female', 'gemini', 'humanresources', 'indunk', 'internet', 'investmentbanking', 'law', 'lawenforcement', 'leo', 'libra', 'libraries', 'male', 'marketing', 'media', 'museums', 'non', 'pisces', 'profit', 'publishing', 'recreation', 'religion', 'sagittarius', 'science', 'scorpio', 'security', 'sports', 'student', 'taurus', 'technology', 'telecommunications', 'virgo']


In [73]:
len(labeled_classes)

69

7. Transform the labels - (7.5 points)

As we have noticed before, in this task each example can have multiple tags. To deal with
such kind of prediction, we need to transform labels in a binary form and the prediction will be
a mask of 0s and 1s. 

For this purpose, it is convenient to use MultiLabelBinarizer from sklearn

a. Convert your train and test labels using MultiLabelBinarizer

In [87]:
multilabel_binarizer = MultiLabelBinarizer(labeled_classes) # initialising multilabelbinariser

In [90]:
y_train_mlb=multilabel_binarizer.fit_transform(y_train)
y_test_mlb=multilabel_binarizer.fit_transform(y_test)

In [91]:
y_train_mlb[0]

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0])

In [103]:
y_test_mlb[20]

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0])

In [93]:
len(multilabel_binarizer.classes_)

69

In [94]:
multilabel_binarizer.classes_

array(['male', '15', 'student', 'leo', '33', 'investmentbanking',
       'aquarius', 'female', '14', 'indunk', 'aries', '25', 'capricorn',
       '17', 'gemini', '23', 'non', 'profit', 'cancer', 'banking', '37',
       'sagittarius', '26', '24', 'scorpio', '27', 'education', '45',
       'engineering', 'libra', 'science', '34', '41', 'communications',
       'media', 'businessservices', 'sports', 'recreation', 'virgo',
       'taurus', 'arts', 'pisces', '44', '16', 'internet', 'museums',
       'libraries', 'accounting', '39', '35', 'technology', '36', 'law',
       '46', 'consulting', 'automotive', '42', 'religion', '13',
       'fashion', '38', '43', 'publishing', '40', 'marketing',
       'lawenforcement', 'security', 'humanresources',
       'telecommunications'], dtype=object)

8) Choose a classifier - (5 points)

In this task, using the One-vs-Rest approach. In this approach k classifiers (= number of tags) are trained.

As a basic classifier, use LogisticRegression.

a. Use a linear classifier of your choice, wrap it up in OneVsRestClassifier to train it on
every label

b. As One-vs-Rest approach might not have been discussed in the sessions, we are
providing you the code for that

In [95]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

In [96]:
clf = LogisticRegression(solver='lbfgs',max_iter=1000)
clf = OneVsRestClassifier(clf)

9) Fit the classifier, make predictions and get the accuracy (5 points)

    a. Print the following
        
        i. Accuracy score

        ii. F1 score

        iii. Average precision score

        iv. Average recall score

In [97]:
# Fitting training  data
clf.fit(X_train_dtm,y_train_mlb)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=1000,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [98]:
print("Train Accuracy:",clf.score(X_train_dtm,y_train_mlb))

Train Accuracy: 0.9611428571428572


In [99]:
y_predicted = clf.predict(X_test_dtm)

In [100]:
print("Test Accuracy:" + str(accuracy_score(y_test_mlb, y_predicted)))
print("F1: " + str(f1_score(y_test_mlb, y_predicted, average='micro')))
print("F1_macro: " + str(f1_score(y_test_mlb, y_predicted, average='macro')))
print("Precision: " + str(precision_score(y_test_mlb, y_predicted, average='micro')))
print("Precision_macro: " + str(precision_score(y_test_mlb, y_predicted, average='macro')))
print("Recall: " + str(recall_score(y_test_mlb, y_predicted, average='micro')))
print("Recall_macro: " + str(recall_score(y_test_mlb, y_predicted, average='macro')))

Test Accuracy:0.31433333333333335
F1: 0.6340836012861737
F1_macro: 0.22636353329949765
Precision: 0.7717037928958459
Precision_macro: 0.42086210912366706
Recall: 0.5381192275398825
Recall_macro: 0.17512732808324732


10) Print true label and predicted label for any five examples (7.5 points)

In [101]:
y_predicted_inv = multilabel_binarizer.inverse_transform(y_predicted)   # inverse transforming predited label data
y_test_mlb_inv =  multilabel_binarizer.inverse_transform(y_test_mlb) # inverse transforming original test label data

In [102]:
print(" predicted :",y_predicted_inv[0])
print(" Actual :",y_test_mlb_inv[0])

 predicted : ('male',)
 Actual : ('male', 'aries', '36', 'fashion')


In [104]:
print("Example 1 - predicted :",y_predicted_inv[0])
print("Example 1 - Actual :",y_test_mlb_inv[0])
print("Example 1 - Actual_before mlb transformation :",y_test[0])

Example 1 - predicted : ('male',)
Example 1 - Actual : ('male', 'aries', '36', 'fashion')
Example 1 - Actual_before mlb transformation : ['male', '36', 'fashion', 'aries']


In [105]:
print("Example 2 - predicted :",y_predicted_inv[22])
print("Example 2 - Actual :",y_test_mlb_inv[22])
print("Example 2 - Actual_before mlb transformation :",y_test[22])

Example 2 - predicted : ('male', 'aries', '35', 'technology')
Example 2 - Actual : ('male', 'aries', '35', 'technology')
Example 2 - Actual_before mlb transformation : ['male', '35', 'technology', 'aries']


In [106]:
print("Example 3 - predicted :",y_predicted_inv[225])
print("Example 3 - Actual :",y_test_mlb_inv[225])
print("Example 3 - Actual_before mlb transformation :",y_test[225])

Example 3 - predicted : ('female', 'indunk', '17', 'scorpio')
Example 3 - Actual : ('female', 'indunk', '17', 'scorpio')
Example 3 - Actual_before mlb transformation : ['female', '17', 'indunk', 'scorpio']


In [112]:
print("Example 4 - predicted :",y_predicted_inv[2500])
print("Example 4 - Actual :",y_test_mlb_inv[2500])
print("Example 4 - Actual_before mlb transformation :",y_test[2500])

Example 4 - predicted : ('male', 'aries')
Example 4 - Actual : ('male', 'aries', '35', 'technology')
Example 4 - Actual_before mlb transformation : ['male', '35', 'technology', 'aries']


In [111]:
print("Example 5 - predicted :",y_predicted_inv[1115])
print("Example 5 - Actual :",y_test_mlb_inv[1115])
print("Example 5 - Actual_before mlb transformation :",y_test[1115])

Example 5 - predicted : ('female', 'indunk', 'cancer')
Example 5 - Actual : ('male', 'indunk', 'cancer', '16')
Example 5 - Actual_before mlb transformation : ['male', '16', 'indunk', 'cancer']
