In [62]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from warnings import filterwarnings
filterwarnings(action = 'ignore')

In [34]:
#Reading the file
df = pd.read_csv('blogtext.csv')

In [35]:
#Taking a sample of 200k rows
df = df.sample(n = 5000)
df.reset_index(inplace = True)

In [36]:
#Looking at dtypes
df.dtypes

index      int64
id         int64
gender    object
age        int64
topic     object
sign      object
date      object
text      object
dtype: object

In [37]:
#Converting age to string
df['age'] = df['age'].astype('str')

In [38]:
df.head()

Unnamed: 0,index,id,gender,age,topic,sign,date,text
0,32564,3622069,female,23,indUnk,Aries,"08,July,2004",y i say like tat...wan sth u wish to kn...
1,562589,1157144,female,17,Student,Libra,"05,August,2004",OK so... birthday San Fran thing... is ...
2,257166,2815932,male,26,Science,Sagittarius,"05,April,2004",Here are your comments immortalised...
3,301715,3669029,female,13,indUnk,Libra,"20,July,2004",&nbsp;&nbsp; you kno its really wierd ...
4,396918,3636814,female,39,Fashion,Pisces,"29,June,2004",I've been considering getting a bike. N...


In [39]:
import re
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
stop_words = set(stopwords.words('english')) 

def remove_stopwords(sentence):
    word_tokens = word_tokenize(sentence)
    filtered_sentence = [w for w in word_tokens if not w in stop_words] 
    return " ".join(filtered_sentence)

### Preprocessing

In [40]:
%%time

#Converting text to lowercase
df['text'] = df['text'].apply(lambda s: s.lower())

#Selecting only numbers, alphabets, and #+_ 
df['text'] = df['text'].apply(lambda s: re.sub('[^0-9a-z ]','',s))

#Removing stop words
df.apply(lambda x: remove_stopwords(x['text']), axis = 1)

#Removing extra spaces
df['text'] = df['text'].apply(lambda s: s.strip())

Wall time: 3.16 s


### Merging all column labels together

In [41]:
#Creating a list with combination of each row across 4 columns
results = []
for i in range(len(df)):
    l=[]
    l.append(df.loc[i,'gender'])
    l.append(df.loc[i,'age'])
    l.append(df.loc[i,'topic'])
    l.append(df.loc[i,'sign'])
    results.append(l)
df['label'] = results

# #Subsetting only the text and label columns
# df = df[['text','label']]

In [42]:
df.head()

Unnamed: 0,index,id,gender,age,topic,sign,date,text,label
0,32564,3622069,female,23,indUnk,Aries,"08,July,2004",y i say like tatwan sth u wish to knowto meett...,"[female, 23, indUnk, Aries]"
1,562589,1157144,female,17,Student,Libra,"05,August,2004",ok so birthday san fran thing is saturday ok all,"[female, 17, Student, Libra]"
2,257166,2815932,male,26,Science,Sagittarius,"05,April,2004",here are your comments immortalised is that h...,"[male, 26, Science, Sagittarius]"
3,301715,3669029,female,13,indUnk,Libra,"20,July,2004",nbspnbsp you kno its really wierd how people l...,"[female, 13, indUnk, Libra]"
4,396918,3636814,female,39,Fashion,Pisces,"29,June,2004",ive been considering getting a bike no not a h...,"[female, 39, Fashion, Pisces]"


### Separating features and labels + Train-test split

In [43]:
#Selecting the feature column
features = df['text']
labels = df['label']

#Train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size = 0.3, random_state = 42)

### Vectorize the features

A. Create a Bag of Words using count vectorizer  
i. Use ngram_range=(1, 2)  
ii. Vectorize training and testing features

In [44]:
%%time
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words='english',ngram_range=(1, 2),max_df=0.2,min_df=2)
#fit_transform on X_train
X_train_vector = cv.fit_transform(X_train)
#transform on X_test
X_test_vector = cv.transform(X_test)

Wall time: 1.49 s


b. Print the term-document matrix

In [45]:
#print the dtm
print(X_train_vector)

  (0, 28367)	1
  (0, 2067)	1
  (0, 6414)	1
  (0, 25767)	1
  (0, 4767)	1
  (0, 19398)	1
  (0, 33852)	1
  (0, 31859)	1
  (0, 606)	1
  (0, 3218)	1
  (0, 360)	1
  (0, 13955)	2
  (0, 28703)	1
  (0, 18049)	1
  (0, 20027)	1
  (0, 18890)	1
  (0, 2074)	1
  (0, 29690)	1
  (0, 18904)	1
  (0, 14028)	1
  (1, 25767)	1
  (1, 19079)	2
  (1, 2310)	1
  (1, 32211)	1
  (1, 23682)	1
  :	:
  (3499, 31589)	1
  (3499, 6781)	1
  (3499, 5817)	1
  (3499, 32840)	1
  (3499, 21502)	1
  (3499, 22837)	1
  (3499, 20478)	1
  (3499, 6563)	1
  (3499, 13171)	1
  (3499, 24687)	2
  (3499, 21808)	1
  (3499, 14318)	1
  (3499, 20280)	1
  (3499, 19648)	1
  (3499, 14002)	1
  (3499, 31631)	1
  (3499, 34131)	1
  (3499, 8367)	1
  (3499, 28675)	1
  (3499, 30750)	1
  (3499, 28226)	1
  (3499, 19965)	1
  (3499, 8149)	1
  (3499, 20991)	1
  (3499, 27514)	1


### 6. Create a dictionary to get the count of every label

In [46]:
columns=['gender','age','topic','sign']
x={}
for i in columns:
    x.update(df[i].value_counts())

In [47]:
x

{'male': 2510,
 'female': 2490,
 '17': 616,
 '24': 556,
 '16': 555,
 '23': 547,
 '25': 521,
 '26': 419,
 '27': 341,
 '15': 298,
 '14': 213,
 '34': 131,
 '33': 122,
 '35': 109,
 '13': 106,
 '36': 98,
 '37': 63,
 '38': 50,
 '45': 36,
 '39': 33,
 '40': 32,
 '43': 32,
 '42': 30,
 '46': 26,
 '41': 23,
 '48': 16,
 '44': 14,
 '47': 13,
 'indUnk': 1824,
 'Student': 1125,
 'Technology': 306,
 'Arts': 255,
 'Education': 223,
 'Communications-Media': 162,
 'Internet': 120,
 'Non-Profit': 105,
 'Engineering': 84,
 'Law': 64,
 'Publishing': 60,
 'Science': 54,
 'Religion': 44,
 'Accounting': 43,
 'Government': 43,
 'Consulting': 41,
 'Fashion': 37,
 'Advertising': 36,
 'Marketing': 33,
 'Banking': 30,
 'BusinessServices': 28,
 'Chemicals': 28,
 'Telecommunications': 28,
 'Military': 25,
 'RealEstate': 25,
 'Transportation': 19,
 'Sports-Recreation': 19,
 'Manufacturing': 18,
 'HumanResources': 18,
 'Biotech': 17,
 'Museums-Libraries': 16,
 'Automotive': 15,
 'Tourism': 15,
 'InvestmentBanking': 10,

### 7.Transform the labels
a. Convert your train and test labels using MultiLabelBinarizer

In [63]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()

#Getting the one-hot-encoded output as an array
y_train_transform = mlb.fit_transform(y_train)
y_test_transform = mlb.transform(y_test)

### 8. Choose a classifier
a. Use a linear classifier of your choice, wrap it up in OneVsRestClassifier to train it on
every label

In [64]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(solver='lbfgs')
clf = OneVsRestClassifier(clf)

### 9. Fit the classifier, make predictions and get the accuracy 

In [65]:
%%time
clf.fit(X_train_vector, y_train_transform)

Wall time: 15.4 s


OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

### Print the following
1. Accuracy score
2. F1 score
3. Average precision score
4. Average recall score  

In [74]:
from sklearn import metrics
#Using macro average as it gives equal weight to all the datapoints
print('Test Accuracy: ',metrics.accuracy_score(y_test_transform, clf.predict(X_test_vector)))
print('F1-Score using macro average is:',metrics.f1_score(y_test_transform, clf.predict(X_test_vector),average='macro'))
print('Average Precision using macro average is:',metrics.average_precision_score(y_test_transform, clf.predict(X_test_vector),average='macro'))
print('Average Recall using macro average is:',metrics.recall_score(y_test_transform, clf.predict(X_test_vector),average='macro'))

Test Accuracy:  0.0013333333333333333
F1-Score using macro average is: 0.03727682334499728
Average Precision using macro average is: nan
Average Recall using macro average is: 0.029288463269384737


In [76]:
#We can also print the classification report to display all these metrics at once
print(metrics.classification_report(y_test_transform, clf.predict(X_test_vector)))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        33
           1       0.60      0.05      0.10        57
           2       0.19      0.05      0.08        80
           3       0.20      0.05      0.08       180
           4       0.34      0.11      0.16       177
           5       0.26      0.05      0.08       170
           6       0.12      0.02      0.03       156
           7       0.12      0.02      0.04       161
           8       0.25      0.03      0.05       130
           9       0.00      0.00      0.00       107
          10       0.50      0.02      0.05        41
          11       0.50      0.02      0.04        46
          12       0.00      0.00      0.00        40
          13       0.00      0.00      0.00        25
          14       0.00      0.00      0.00        15
          15       0.00      0.00      0.00        11
          16       0.00      0.00      0.00        12
          17       0.00    

### 10. Print true label and predicted label for any five examples 

In [71]:
#Printing the labels of the predicted values for first 5 examples
mlb.inverse_transform(clf.predict(X_test_vector[:5]))

[('female', 'indUnk'), ('male',), ('female',), ('female',), ('indUnk', 'male')]

In [69]:
#Printing the actual labels for the first 5 examples
mlb.inverse_transform(y_test_transform[:5])

[('15', 'Non-Profit', 'Scorpio', 'male'),
 ('26', 'Aries', 'Student', 'female'),
 ('23', 'Cancer', 'Student', 'female'),
 ('24', 'Student', 'Taurus', 'male'),
 ('14', 'Student', 'Virgo', 'male')]

Due to the large number of classes and the limitations of performance, we were not able to generate a model with a good accuracy. Using ensemble classifiers/neural networks along with a powerful GPU will enable us to use a larger dataset for training and might lead to better accuracies.

---------