In [1]:
import numpy as np
import pandas as pd

## Load the Data Set

In [2]:
# Read the blog corpus from the CSV that sits next to the notebook.
df = pd.read_csv(filepath_or_buffer="blogtext.csv")
# Preview the first five rows to confirm the columns parsed as expected.
df.head(n=5)

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


In [3]:
# (rows, columns) of the frame as loaded from disk.
df.shape

(681284, 7)

#### Check if there is any null value, and get the total count.

In [4]:
# Per-column count of missing values (isna is the canonical spelling of isnull).
df.isna().sum()

id        0
gender    0
age       0
topic     0
sign      0
date      0
text      0
dtype: int64

In [5]:
# Work on the first 100 posts only. Take an explicit copy: head() returns a
# slice of the full frame, and the later cells assign new/modified columns on
# `df`, which would otherwise raise SettingWithCopyWarning and leave it unclear
# whether the original frame is being modified.
df = df.head(100).copy()

## Preprocess rows of the “text” column
- a. Remove unwanted characters
- b. Convert text to lowercase
- c. Remove unwanted spaces
- d. Remove stopwords

In [6]:
# Normalise the blog text in a single chained pass, then drop English stopwords.
import re
import nltk
# Import the corpus reader under an alias so that binding the word *set* to
# `stopwords` below does not overwrite the reader itself.
from nltk.corpus import stopwords as nltk_stopwords

# Every run of characters that is not an ASCII letter (digits, punctuation,
# whitespace) collapses to one space so that neighbouring words stay separated.
non_letters = re.compile('[^A-Za-z]+')

df['text'] = (
    df['text']
    .str.replace(non_letters, ' ', regex=True)  # keep alphabetic characters only
    .str.lower()                                # convert text to lowercase
    .str.strip()                                # strip leading/trailing spaces
)

# Remove stopwords
nltk.download('stopwords')
# A set gives constant-time membership tests inside the per-word filter.
stopwords = set(nltk_stopwords.words('english'))
df['text'] = df['text'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in stopwords)
)

[nltk_data] Downloading package stopwords to /Users/luv/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
# Spot-check one cleaned post: the row labelled 12, column 'text'.
df.loc[12, 'text']

'last night pretty fun mostly company kept recently met couple finance types yeouido hard referred korea wall street spoke pretty good english rarity yeouido everywhere korea studied outside korea deal international business still brutal canadian accent made pretty tough figure saying sometimes one time accent got way though went restaurant guy junseok gal named hye kyung asked like deok wrote heard thought meant dog eat called bluff said sure let go kind dog oh deok deok ya dog deok said figured meant duck said oh oh ri duck gay dog speak korean know food great went obligatory ee cha thanks hye kyung correction second round koreans never go one place eat drink usually wander streets go three four five places couple weeks ago hongdae university bar district went places san nak ji living octopus restaurant old rock cool bar noraebang korean word karaoke finally clubnb noise basement wee hours morning personally prefer kangnam version clubnb since hongdae went one coolest thing bar hoppi

## Merge the label columns

In [9]:
# Combine the four label columns into one list per post; age is stringified so
# every label in the list is a string.
df['labels'] = pd.Series(
    [
        [gender, str(age), topic, sign]
        for gender, age, topic, sign in zip(df['gender'], df['age'], df['topic'], df['sign'])
    ],
    index=df.index,
)

### Select only required columns from your dataframe

In [10]:
# Keep only the model input ('text') and the merged target ('labels').
df = df.loc[:, ['text', 'labels']]

In [11]:
# Sanity check: two columns should remain after the selection above.
df.shape

(100, 2)

### Print final dataframe

In [12]:
# Preview the cleaned text next to its merged label list.
df.head()

Unnamed: 0,text,labels
0,info found pages mb pdf files wait untill team...,"[male, 15, Student, Leo]"
1,team members drewes van der laag urllink mail ...,"[male, 15, Student, Leo]"
2,het kader van kernfusie op aarde maak je eigen...,"[male, 15, Student, Leo]"
3,testing testing,"[male, 15, Student, Leo]"
4,thanks yahoo toolbar capture urls popups means...,"[male, 33, InvestmentBanking, Aquarius]"


## Create training and testing data

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the posts for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'].values,
    df['labels'].values,
    test_size=0.20,
    random_state=42,
)

## Vectorize the data
- a. Create a Bag of Words using count vectorizer
    - i. Use ngram_range=(1, 2)
    - ii. Vectorize training and testing features
- b. Print the term-document matrix


### Create Bag of Words


In [14]:
from sklearn.feature_extraction.text import CountVectorizer

# Unigrams and bigrams, recorded as present/absent (binary) rather than counted.
vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    binary=True,
)

In [15]:
# Learn the vocabulary from the training texts only and encode them.
X_train_bow = vectorizer.fit_transform(X_train)

In [16]:
# Encode the test texts with the vocabulary fitted on the training split
# (transform only, no re-fit, so no test vocabulary leaks into the features).
X_test_bow = vectorizer.transform(X_test)

#### Have a look at some feature names

In [17]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. Fall back to the old method only
# when running on a release that predates the new one.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# list(...) keeps the display a plain Python list regardless of which method ran.
list(feature_names[:5])



['aaldering', 'aaldering urllink', 'aarde', 'aarde maak', 'abandons']

#### Print term-document matrix

In [18]:
# Densify the bag-of-words matrix for display only; the classifier below is
# fitted on X_train_bow itself, not on this dense copy.
X_train_bow.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

### Create a dictionary to get label counts

In [19]:
# Tally how many posts carry each individual label (gender, age, topic, sign).
label_counts = {}
for labels in df.labels.values:
    for label in labels:
        # .get() supplies 0 the first time a label is seen.
        label_counts[label] = label_counts.get(label, 0) + 1

#### Print the dictionary

In [20]:
# Display the label -> number-of-posts mapping built above.
label_counts

{'male': 74,
 '15': 4,
 'Student': 7,
 'Leo': 4,
 '33': 70,
 'InvestmentBanking': 70,
 'Aquarius': 70,
 'female': 26,
 '14': 21,
 'indUnk': 23,
 'Aries': 21,
 '25': 2,
 'Capricorn': 2,
 '17': 3,
 'Gemini': 3}

## Multi label binarizer

In [21]:
from sklearn.preprocessing import MultiLabelBinarizer

# Fix the column order to the sorted set of every label seen in the data, so
# the train and test indicator matrices share the same columns.
mlb = MultiLabelBinarizer(classes=sorted(label_counts))

# NOTE: y_train / y_test are overwritten with their 0/1 indicator matrices, so
# this cell must be run exactly once after the train/test split cell.
mlb.fit(y_train)
y_train = mlb.transform(y_train)
y_test = mlb.transform(y_test)

## Classifier

In [22]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

# One independent logistic-regression model per label column.
clf = OneVsRestClassifier(estimator=LogisticRegression(solver='lbfgs'))

### Fit the classifier

In [23]:
# Train on the bag-of-words features against the binarized label matrix.
clf.fit(X_train_bow, y_train)

OneVsRestClassifier(estimator=LogisticRegression())

## Make predictions

In [24]:
# Hard label predictions for the held-out posts.
predicted_labels = clf.predict(X_test_bow)
# Raw per-label decision scores for the same posts.
# NOTE(review): predicted_scores is not used by any later cell in this notebook.
predicted_scores = clf.decision_function(X_test_bow)

### Get inverse transform for predicted labels and test labels

In [25]:
# Map the binarized rows back to their label names so predictions can be read
# side by side with the ground truth.
pred_inversed = mlb.inverse_transform(predicted_labels)
y_test_inversed = mlb.inverse_transform(y_test)

### Print some samples

In [26]:
# Show up to five held-out posts with their true and predicted labels. The
# count is bounded by the size of the test split so the cell does not raise
# IndexError when fewer than five test rows exist.
for i in range(min(5, len(X_test))):
    print('Title:\t{}\nTrue labels:\t{}\nPredicted labels:\t{}\n\n'.format(
        X_test[i],
        ','.join(y_test_inversed[i]),
        ','.join(pred_inversed[i])
    ))

Title:	amazing really every season sole still love always never backed away matter unbearable weather hard find people loyal enough go lucky guess seasons sole summer everything well life going smoothly bumps road fall colors fall life falls apart unlike leaf float crash hard winter rain starts fall life completely torn peaces bare winter harshest seasons emotional blizzard spring life starts anew gather often use support weak point life begins blossom
True labels:	14,Aries,female,indUnk
Predicted labels:	female


Title:	busy entertaining british friend mine originally manchester k l kuala lumpur basically ruined asia see western guy comes hardly go back life west know two aussie brothers know thing work investment banks seoul shanghai speak respective languages life good effect korean women go places like vancouver bear leave place either anyways business partner aussie town first time went nights also american dutch friend mine koreans wanted go couple nights business like said busy 

## Calculate accuracy and other evaluation metrics

In [27]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

def print_evaluation_scores(y_val, predicted, scores=None):
    """Print accuracy, F1, average precision and recall for a set of predictions.

    F1, average precision and recall are micro-averaged.

    Parameters
    ----------
    y_val
        Ground-truth labels, in the format accepted by the scikit-learn
        metric functions called below.
    predicted
        Hard (thresholded) predictions aligned with ``y_val``.
    scores
        Optional continuous scores aligned with ``y_val`` (for example the
        output of a classifier's ``decision_function``). Average precision
        summarises a ranking, so it is computed from ``scores`` when they are
        supplied. When omitted, ``predicted`` is used instead, which is the
        function's previous behaviour.

    Returns
    -------
    None
        The four metrics are written to standard output.
    """
    # Average precision is the only metric here that can use a ranking signal;
    # the other three are defined on hard predictions.
    ranking = predicted if scores is None else scores

    print('Accuracy score: ', accuracy_score(y_val, predicted))
    print('F1 score: ', f1_score(y_val, predicted, average='micro'))
    print('Average precision score: ', average_precision_score(y_val, ranking, average='micro'))
    print('Average recall score: ', recall_score(y_val, predicted, average='micro'))

In [28]:
# Report the metrics for the bag-of-words model on the held-out split.
# NOTE: only the hard 0/1 predictions are passed in, so the average-precision
# figure is computed from thresholded labels rather than from predicted_scores.
print('Bag-of-words')
print_evaluation_scores(y_test, predicted_labels)

Bag-of-words
Accuracy score:  0.55
F1 score:  0.7611940298507462
Average precision score:  0.69875
Average recall score:  0.6375
