###Load the data

In [56]:
import pandas as pd
df=pd.read_csv("/content/drive/MyDrive/AIML Projects/Statistical NLP project/blogtext.csv")

In [57]:
df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


###Preprocesing Text

#####We've included the data, next steps would include preprocessing it based on,
1. Ensure no Null/Nan values are present 
2. Keep only text items
3. Normalize the input 
    a. remove stop words
    b. Lemmatize the words
    c. Stem the words
4. Tokenize the words

In [58]:
df_t = df[['text']].head(4000)

In [59]:
df_t.head()

Unnamed: 0,text
0,"Info has been found (+/- 100 pages,..."
1,These are the team members: Drewe...
2,In het kader van kernfusie op aarde...
3,testing!!! testing!!!
4,Thanks to Yahoo!'s Toolbar I can ...


In [60]:
df_t.isnull().sum()

text    0
dtype: int64

In [61]:
df_t.isna().sum()

text    0
dtype: int64

####Remove unwanted characters and convert this to lower case

In [62]:
#import regular expression
import re

In [63]:
df_t.text= df_t.text.apply(lambda x: re.sub('[^A-Za-z]+', ' ', x))

####Remove unwanted spaces

In [64]:
df_t.text = df_t.text.apply(lambda x: x.strip())

####Convert to lower case

In [65]:
df_t.text = df_t.text.apply(lambda x: x.lower())

####Remove Stopwords

In [66]:
import nltk
nltk.download('stopwords')#download stopwords dictionary from NLTK

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [67]:
from nltk.corpus import stopwords

In [68]:
stop_words = set(stopwords.words('english'))

In [69]:
df_t.text = df_t.text.apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

In [70]:
#random check of any row
df_t['text'].iloc[3212]

'yay dubya coming visit england huge amounts people coming give great leader urllink warm welcome wait til gets urllink'

###Merging and creating multi label Y values 

In [71]:
df.columns

Index(['id', 'gender', 'age', 'topic', 'sign', 'date', 'text'], dtype='object')

In [106]:
df['age']=df['age'].astype(str)
#df_t['labels'] = df[['gender','age','topic','sign']].apply(lambda x: ','.join(x), axis = 1)
df_t['labels'] = df.apply(lambda row: [row['gender'].lower(), str(row['age']), row['topic'].lower(), row['sign'].lower()], axis=1)
df_t.head()

Unnamed: 0,text,labels,labels2
0,info found pages mb pdf files wait untill team...,"[male, 15, student, leo]","male , 15 , Student , Leo"
1,team members drewes van der laag urllink mail ...,"[male, 15, student, leo]","male , 15 , Student , Leo"
2,het kader van kernfusie op aarde maak je eigen...,"[male, 15, student, leo]","male , 15 , Student , Leo"
3,testing testing,"[male, 15, student, leo]","male , 15 , Student , Leo"
4,thanks yahoo toolbar capture urls popups means...,"[male, 33, investmentbanking, aquarius]","male , 33 , InvestmentBanking , Aquarius"


###Normalization

In [15]:
#Stemming
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
  
stemmer = PorterStemmer()

words = word_tokenize(df_t.text)
  
for w in words:
    print(w, " : ", ps.stem(w))

LookupError: ignored

In [16]:
#Lemmatization
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    lemm = [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]
    return(" ".join(lemm)) 

df_t["text"] = df_t.text.apply(lemmatize_text)

LookupError: ignored

###Train_Test_Split

In [107]:
from sklearn.model_selection import train_test_split
X_train, x_test, y_train, y_test = train_test_split(df_t.text.values,df_t.labels.values, test_size=0.25, random_state=56 )

In [108]:
print(X_train.shape)
x_test.shape

(3000,)


(1000,)

In [109]:
y_train[2991]

['male', '15', 'student', 'aquarius']

###Vectorize features

####Tf_IDF Vectorizer

In [110]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf_idf = TfidfVectorizer(ngram_range=(1,2),max_features=20000)

In [111]:
input_data=df_t.text.values
input_vectors=tf_idf.fit(input_data)
X_trainv=tf_idf.transform(X_train)
x_testv=tf_idf.transform(x_test)

In [112]:
print(X_trainv[2245])#,X_trainv.shape)
print(x_testv[984])#,x_testv.shape)

  (0, 18510)	0.39370048317955497
  (0, 10676)	0.45380258680428087
  (0, 9249)	0.4250021733051824
  (0, 8669)	0.4094321711369768
  (0, 7351)	0.31274563215882123
  (0, 4208)	0.43930827586495597
  (0, 19719)	0.12878808137950393
  (0, 19646)	0.04876470571257804
  (0, 18397)	0.21549724334747386
  (0, 17784)	0.06995608460834722
  (0, 17689)	0.046164974057414584
  (0, 17331)	0.13717783279087756
  (0, 17329)	0.056346073516481354
  (0, 16950)	0.11756716720049198
  (0, 16947)	0.05754185958724
  (0, 16812)	0.10404217656287547
  (0, 16217)	0.12878808137950393
  (0, 15447)	0.09058705058227486
  (0, 15312)	0.16400299521069317
  (0, 15280)	0.0833493374002035
  (0, 15259)	0.07132986975236581
  (0, 13949)	0.14116103378781816
  (0, 13948)	0.1084487269773085
  (0, 13698)	0.06713373902887564
  (0, 13141)	0.13392332060574683
  (0, 12860)	0.13392332060574683
  (0, 12433)	0.08702887911926452
  (0, 12272)	0.09223595102626964
  (0, 11749)	0.09572228980851774
  (0, 10953)	0.08200149760534658
  (0, 10630)	0.1248

####Count Vectorizer

In [113]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(ngram_range=(1,2),max_features=20000)

In [114]:
input_cvectors=cv.fit(input_data)
X_traincv=cv.transform(X_train)
x_testcv=cv.transform(x_test)

In [115]:
print(X_traincv.shape)
print(x_testcv.shape)

(3000, 20000)
(1000, 20000)


####Document term matrix

In [116]:
df_dtmtrain = pd.DataFrame(X_traincv.toarray(), columns=cv.get_feature_names())
print(df_dtmtrain.head())

   aa  aal  aaron  aba  abandon  ...  zodiac  zombie  zone  zoo  zoom
0   0    0      0    0        0  ...       0       0     0    0     0
1   0    0      0    0        0  ...       0       0     0    0     0
2   0    0      0    0        0  ...       0       0     0    0     0
3   0    0      0    0        0  ...       0       0     0    0     0
4   0    0      0    0        0  ...       0       0     0    0     0

[5 rows x 20000 columns]


In [117]:
df_dtmtest = pd.DataFrame(x_testcv.toarray(), columns=cv.get_feature_names())
print(df_dtmtest.head())
print(df_dtmtest.shape)

   aa  aal  aaron  aba  abandon  ...  zodiac  zombie  zone  zoo  zoom
0   0    0      0    0        0  ...       0       0     0    0     0
1   0    0      0    0        0  ...       0       0     0    0     0
2   0    0      0    0        0  ...       0       0     0    0     0
3   0    0      0    0        0  ...       0       0     0    0     0
4   0    0      0    0        0  ...       0       0     0    0     0

[5 rows x 20000 columns]
(1000, 20000)


###Transform Labels

####Dictionary to get the count of every label

In [118]:
import numpy as np
cv2=CountVectorizer(ngram_range=(1,1),min_df=1)
df_t['labels2']=np.nan
for i in range(len(df_t.labels)):
  df_t['labels2'].iloc[i]=' , '.join([str(item) for item in df_t.labels.iloc[i]])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [119]:
df_t.head(5)

Unnamed: 0,text,labels,labels2
0,info found pages mb pdf files wait untill team...,"[male, 15, student, leo]","male , 15 , student , leo"
1,team members drewes van der laag urllink mail ...,"[male, 15, student, leo]","male , 15 , student , leo"
2,het kader van kernfusie op aarde maak je eigen...,"[male, 15, student, leo]","male , 15 , student , leo"
3,testing testing,"[male, 15, student, leo]","male , 15 , student , leo"
4,thanks yahoo toolbar capture urls popups means...,"[male, 33, investmentbanking, aquarius]","male , 33 , investmentbanking , aquarius"


In [120]:
labels_cv=cv2.fit(df_t.labels2.values)

In [121]:
dictionary={}
dictionary=cv2.vocabulary_
dictionary

{'14': 0,
 '15': 1,
 '16': 2,
 '17': 3,
 '23': 4,
 '24': 5,
 '25': 6,
 '26': 7,
 '27': 8,
 '33': 9,
 '34': 10,
 '35': 11,
 '36': 12,
 '37': 13,
 '39': 14,
 '41': 15,
 '44': 16,
 '45': 17,
 'accounting': 18,
 'aquarius': 19,
 'aries': 20,
 'arts': 21,
 'banking': 22,
 'businessservices': 23,
 'cancer': 24,
 'capricorn': 25,
 'communications': 26,
 'education': 27,
 'engineering': 28,
 'female': 29,
 'gemini': 30,
 'indunk': 31,
 'internet': 32,
 'investmentbanking': 33,
 'leo': 34,
 'libra': 35,
 'libraries': 36,
 'male': 37,
 'media': 38,
 'museums': 39,
 'non': 40,
 'pisces': 41,
 'profit': 42,
 'recreation': 43,
 'sagittarius': 44,
 'science': 45,
 'scorpio': 46,
 'sports': 47,
 'student': 48,
 'taurus': 49,
 'technology': 50,
 'virgo': 51}

####MultiClass labels into a matix of 1-hot using Multi-label Binarizer

As the Y value here is composed of multiple classes for each input example, we cannot apply one hot as it to compare that with prediction, for this reason multilabelbinarizer helps
MultiLabelBinarizer from sklearn is used to convert one hot of each unique classes in the input.
With this as an actual input we can do prediction comparison

In [122]:
from sklearn.preprocessing import MultiLabelBinarizer

In [123]:
#We need the list of classes in the input to convert them to One-hot form using MLB
#Creating list of unique classes
labels_classes=[]

for key in cv2.vocabulary_.keys():
  labels_classes.append(key)

In [125]:
labels_classes

['male',
 '15',
 'student',
 'leo',
 '33',
 'investmentbanking',
 'aquarius',
 'female',
 '14',
 'indunk',
 'aries',
 '25',
 'capricorn',
 '17',
 'gemini',
 '23',
 'non',
 'profit',
 'cancer',
 'banking',
 '37',
 'sagittarius',
 '26',
 '24',
 'scorpio',
 '27',
 'education',
 '45',
 'engineering',
 'libra',
 'science',
 '34',
 '41',
 'communications',
 'media',
 'businessservices',
 'sports',
 'recreation',
 'virgo',
 'taurus',
 'arts',
 'pisces',
 '44',
 '16',
 'internet',
 'museums',
 'libraries',
 'accounting',
 '39',
 '35',
 'technology',
 '36']

In [126]:
#Lets pass all the classes to MLB and create their function
#Once 1-hot library is created, fit transform the train data to get their ony hot labels
mlb = MultiLabelBinarizer(classes=labels_classes)
labels_mlb=mlb.fit(df_t.labels.values)
#train_labels = mlb.transform(y_train)

In [127]:
train_labels = mlb.transform(y_train)
test_labels= mlb.transform(y_test)

  .format(sorted(unknown, key=str)))


In [128]:
mlb.classes_

array(['male', '15', 'student', 'leo', '33', 'investmentbanking',
       'aquarius', 'female', '14', 'indunk', 'aries', '25', 'capricorn',
       '17', 'gemini', '23', 'non', 'profit', 'cancer', 'banking', '37',
       'sagittarius', '26', '24', 'scorpio', '27', 'education', '45',
       'engineering', 'libra', 'science', '34', '41', 'communications',
       'media', 'businessservices', 'sports', 'recreation', 'virgo',
       'taurus', 'arts', 'pisces', '44', '16', 'internet', 'museums',
       'libraries', 'accounting', '39', '35', 'technology', '36'],
      dtype=object)

In [129]:
test_labels[433]

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0])

In [130]:
y_test[433]

['male', '35', 'technology', 'aries']

###Model Building

Linear classifiers are mostly binary classifiers

As we are dealing with Multi class classification we need to use a classifier which will handle that.

With One-vs-Rest classifier the multi class classification is broken into a series of Binary classifications, we can use a Linear model (logistic regression) for Binary Classification

In [131]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

In [132]:
print(X_traincv.shape)
print(x_testcv.shape)
print(train_labels.shape)
print(test_labels.shape)

(3000, 20000)
(1000, 20000)
(3000, 52)
(1000, 52)


In [134]:
clf = LogisticRegression(solver = 'sag', max_iter=10000)
clf = OneVsRestClassifier(clf)

In [135]:
training=clf.fit(X_traincv,train_labels)

  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))


###Prediction

In [136]:
prediction=clf.predict(x_testcv)

In [137]:
clf.score(x_testcv,test_labels)

0.573

In [138]:
from sklearn.metrics import classification_report

In [146]:
cr=classification_report(test_labels,prediction,target_names= sorted(mlb.classes_),zero_division=1)

In [147]:
print(cr)

                   precision    recall  f1-score   support

               14       0.92      0.95      0.94       788
               15       0.81      0.43      0.57        90
               16       0.85      0.44      0.58       126
               17       1.00      0.17      0.29        24
               23       1.00      0.30      0.46        30
               24       1.00      0.43      0.60        21
               25       0.79      0.35      0.48        78
               26       0.80      0.70      0.75       212
               27       0.89      0.24      0.38        33
               33       0.62      0.40      0.49       134
               34       0.86      0.91      0.88       607
               35       0.62      0.11      0.19        45
               36       0.88      0.35      0.50        20
               37       0.77      0.37      0.50        46
               39       0.00      0.00      0.00        10
               41       0.33      0.09      0.14       