In [None]:
!pip install pattern
# To evaluate sentence modality we use pattern library
# It gives score for a sentence between -1 to +1



In [None]:
import numpy as np
import pandas as pd
import nltk 
import re
import scipy.sparse as sp

from textblob import TextBlob
from pattern.en import modality
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [None]:
# Fetch the complete NLTK data collection (identifier 'all')
nltk.download("all")

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package biocreative_ppi is already up-to-date!
[nltk_data]    | Downloading package brown to /root/nltk_data...
[nltk_data]    |   Package brown is already up-to-date!
[nltk_data]    | Downloading package brown_tei to /root/nltk_data...
[nltk_data]    |   Package brown_tei is already up-to-date!
[nltk_data]    | Downloading package cess_cat to /root/nltk_data...
[nltk_data]    |   Package cess_cat is already up-to-date!
[nltk_data]    | Downloading package cess_esp to /root/nltk_data...
[nltk_data]    |   Package cess_esp is already up-to-date!
[nltk_data]    | Downloading packag

True

# Dataset

In [None]:
# Location of the tab-separated blog credibility dataset
filePath = 'https://raw.githubusercontent.com/serenpa/Blog-Credibility-Corpus/main/blog_credibility_dataset.tsv'

# First pass: read the file decoding the bytes as latin1
df = pd.read_csv(filePath, encoding='latin1', sep='\t')

# Second pass: turn each sentence back into its latin1 bytes and decode them as
# cp1252, so characters such as the typographic apostrophe are restored
df['sentence_text'] = df['sentence_text'].map(
    lambda raw: raw.encode('latin1').decode('cp1252'))

# Pattern that picks out runs of word characters; everything else is dropped
cleanParser = re.compile(r'(\w+)')

# Cleaned sentence: lower-cased word tokens joined by single spaces
df['cleanSentence'] = df['sentence_text'].map(
    lambda text: ' '.join(token.lower() for token in cleanParser.findall(text.strip())))
df.head()

Unnamed: 0,ID,document_id,sentence_id,sentence_text,Claim,Reasoning,Conclusion,Citation,Code Snippet,Events,Experience,Reference to Table / Image,Statistics / Data,Other,cleanSentence
0,1,253,1,"Stuff Here’s some stuff I’m thinking about, in...",0,0,0,0,0,0,0,0,0,0,stuff here s some stuff i m thinking about in ...
1,2,253,2,VNC vs.,0,0,0,0,0,0,0,0,0,0,vnc vs
2,3,253,3,Windows Terminal Services We regularly use two...,1,0,0,0,0,0,0,0,0,0,windows terminal services we regularly use two...
3,4,253,4,Windows Terminal Services from Microsoft (now ...,0,0,0,0,0,0,0,0,0,0,windows terminal services from microsoft now c...
4,5,253,5,"If you have a choice in the matter, Windows Te...",1,0,0,0,0,0,0,0,0,0,if you have a choice in the matter windows ter...


In [None]:
# Number of distinct documents the sentences were drawn from
print('#Unique document Ids', df['document_id'].nunique())

#Unique document Ids 234


In [None]:
# Class balance of the target label (1 = Experience, 0 = not Experience)
df['Experience'].value_counts()

0    17406
1     2590
Name: Experience, dtype: int64

# Features

In [None]:
def getSubjectivity(sentence):
  '''Return the TextBlob subjectivity score of ``sentence`` rounded to 4 decimals.

  Subjective sentences generally express personal opinion, emotion or judgment,
  whereas objective sentences convey factual information.
  '''
  score = TextBlob(sentence).subjectivity
  return round(score, 4)

# Features 1-4 - Sentiment intensity: compound, negative, neutral, positive

# Shared VADER analyzer, created on the first call and reused afterwards so the
# analyzer is not rebuilt for every sentence.
_vaderAnalyzer = None

def getSentimentIntensity(sentence):
  '''Return the VADER scores of ``sentence`` as (compound, neg, neu, pos).

  The compound score is a metric that calculates the sum of all the lexicon
  ratings, normalized between -1 (most extreme negative) and +1 (most extreme
  positive):
    positive sentiment : compound score >= 0.05
    neutral sentiment  : -0.05 < compound score < 0.05
    negative sentiment : compound score <= -0.05

  Parameters
  ----------
  sentence : str
      Text to score.

  Returns
  -------
  tuple
      The 'compound', 'neg', 'neu' and 'pos' entries of
      ``SentimentIntensityAnalyzer.polarity_scores``, in that order.
  '''
  global _vaderAnalyzer
  if _vaderAnalyzer is None:
    _vaderAnalyzer = SentimentIntensityAnalyzer()
  scores = _vaderAnalyzer.polarity_scores(sentence)
  return tuple(scores.get(key) for key in ('compound', 'neg', 'neu', 'pos'))

# Feature 5 - POS tags of the sentence and how often each tag occurs
def getPOSTags(sentence):
  '''Tokenize and POS-tag ``sentence`` with NLTK.

  Returns a pair: the list of (word, tag) tuples produced by ``nltk.pos_tag``,
  and the ``items()`` view of a Counter mapping each tag to its frequency.
  '''
  from collections import Counter
  tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
  tagFrequency = Counter()
  for _, tag in tagged:
    tagFrequency[tag] += 1
  return tagged, tagFrequency.items()

# Feature 6 - Named entity recognition
def identifyNER(ptags):
  '''Return the named entities found in a POS-tagged sentence.

  ``ptags`` is a list of (word, tag) tuples. It is chunked with
  ``nltk.ne_chunk(..., binary=True)``; for every subtree in the result only the
  first leaf is kept, and the words of those leaves are joined with single
  spaces. When no subtree is found, ``np.nan`` is returned instead of a string.
  '''
  chunked = nltk.ne_chunk(ptags, binary=True)
  firstLeaves = []
  for node in chunked:
    if isinstance(node, nltk.Tree):
      firstLeaves.append(node[0])
  if not firstLeaves:
    return np.nan
  return ' '.join(leaf[0] for leaf in firstLeaves)

# Feature 7 - Count of the first-person pronoun "I"
def countIs(sentence):
  '''Count standalone occurrences of the word "i" in ``sentence`` (case-insensitive).

  Word boundaries are used so that an "i" that merely ends another word
  (e.g. "hi", "wifi") is not counted, while an "i" at the very start or very
  end of the sentence is.

  Parameters
  ----------
  sentence : str
      Text to scan; it is lower-cased before matching.

  Returns
  -------
  int
      Number of standalone "i" tokens.
  '''
  return len(re.findall(r'\bi\b', sentence.lower()))

# Feature 8 - Modality score for given sentence
def getModality(sentence):
  '''Return the ``pattern.en.modality`` score of ``sentence``.

  Modality is a semantic notion that is related to the speaker's opinion and
  belief about the event's believability. Modality in English can be achieved
  by modal verbs (will/would).

  If ``modality`` raises a RuntimeError, 0 is returned instead.
  NOTE(review): presumably this works around a failure inside the pattern
  library - confirm which error is being masked.
  '''
  try:
    score = modality(sentence)
  except RuntimeError:
    score = 0
  return score

In [None]:
# Builds every hand-crafted feature column on df from the cleaned sentence text.

# Subjectivity
df['subjectivity'] = df.cleanSentence.apply(getSubjectivity)

# Feature 9 - Word count
# split() without an argument returns [] for an empty cleaned sentence, so such rows
# count as 0 words (split(' ') would return [''] and report 1).
df['wordCount'] = df.cleanSentence.apply(lambda x: len(x.split()))

# POS tags: the (word, tag) list and the per-tag frequencies, expanded into two columns
df[['pos_tags', 'pos_Density']] = df.apply(lambda x: getPOSTags(x.cleanSentence), axis='columns', result_type='expand')

# Sentiment intensity: getSentimentIntensity returns (compound, neg, neu, pos)
df[['compoundIntensity', 'negativeIntensity', 'neutralIntensity', 'positiveIntensity']] = df.apply(
    lambda x: getSentimentIntensity(x.cleanSentence), axis='columns', result_type='expand')

# NER recognition
# NOTE(review): pos_tags are derived from cleanSentence, which is lower-cased; the sample
# rows show no entities even for sentences naming products/companies - confirm whether
# NER should run on the original-case text instead.
df['ner_Density'] = df.pos_tags.apply(identifyNER)

# Count of I
df['i_count'] = df.cleanSentence.apply(countIs)

# identifyNER returns np.nan (a float) when nothing was found, otherwise a space-joined string
df['ner_count'] = df.ner_Density.apply(lambda x: 0 if isinstance(x, float) else len(x.split(' ')))

# Feature 10 - Char count
df['char_count'] = df.cleanSentence.apply(len)
# Feature 11 - Average word length (split(' ') keeps an empty sentence at 0.0 instead of NaN)
df['avg_word_len'] = df.cleanSentence.apply(lambda x: np.mean([len(i) for i in x.split(' ')]))
# Feature 8 - Modality
df['modality'] = df.cleanSentence.apply(getModality)

# Feature 12 - POS counts: one '<TAG>_Count' column per tag, 0 when the tag is absent
posTagsToCount = ['NN', 'IN', 'DT', 'JJ', 'NNS', 'PRP', 'RB', 'VB',
                  'VBP', 'TO', 'VBZ', 'CC', 'VBD', 'VBG', 'VBN', 'CD']
# Convert each row's (tag, count) pairs to a dict once instead of once per tag column
posCountsPerRow = [dict(items) for items in df.pos_Density]
for posTag in posTagsToCount:
  df[posTag + '_Count'] = [counts.get(posTag, 0) for counts in posCountsPerRow]
df.head()

Unnamed: 0,ID,document_id,sentence_id,sentence_text,Claim,Reasoning,Conclusion,Citation,Code Snippet,Events,Experience,Reference to Table / Image,Statistics / Data,Other,cleanSentence,subjectivity,wordCount,pos_tags,pos_Density,compoundIntensity,negativeIntensity,neutralIntensity,positiveIntensity,ner_Density,i_count,ner_count,char_count,avg_word_len,modality,NN_Count,IN_Count,DT_Count,JJ_Count,NNS_Count,PRP_Count,RB_Count,VB_Count,VBP_Count,TO_Count,VBZ_Count,CC_Count,VBD_Count,VBG_Count,VBN_Count,CD_Count
0,1,253,1,"Stuff Here’s some stuff I’m thinking about, in...",0,0,0,0,0,0,0,0,0,0,stuff here s some stuff i m thinking about in ...,0.3333,13,"[(stuff, NN), (here, RB), (s, VBZ), (some, DT)...","((NN, 4), (RB, 1), (VBZ, 1), (DT, 2), (JJ, 2),...",-0.296,0.196,0.804,0.0,,1,0,65,4.076923,0.25,4,2,2,2,0,0,1,0,0,0,1,0,0,1,0,0
1,2,253,2,VNC vs.,0,0,0,0,0,0,0,0,0,0,vnc vs,0.0,2,"[(vnc, NN), (vs, NN)]","((NN, 2))",0.0,0.0,1.0,0.0,,0,0,6,2.5,1.0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,3,253,3,Windows Terminal Services We regularly use two...,1,0,0,0,0,0,0,0,0,0,windows terminal services we regularly use two...,0.1385,15,"[(windows, NNS), (terminal, JJ), (services, NN...","((NNS, 5), (JJ, 1), (PRP, 1), (RB, 2), (VBP, 1...",0.0,0.0,1.0,0.0,,0,0,102,5.866667,0.625,2,1,0,1,5,1,2,0,1,1,0,0,0,0,0,1
3,4,253,4,Windows Terminal Services from Microsoft (now ...,0,0,0,0,0,0,0,0,0,0,windows terminal services from microsoft now c...,0.3583,39,"[(windows, NNS), (terminal, JJ), (services, NN...","((NNS, 3), (JJ, 5), (IN, 7), (RB, 2), (VBN, 2)...",0.3818,0.0,0.934,0.066,,0,0,220,4.666667,0.333333,8,7,2,5,3,1,2,0,0,0,3,1,1,0,2,1
4,5,253,5,"If you have a choice in the matter, Windows Te...",1,0,0,0,0,0,0,0,0,0,if you have a choice in the matter windows ter...,0.5,18,"[(if, IN), (you, PRP), (have, VBP), (a, DT), (...","((IN, 3), (PRP, 1), (VBP, 1), (DT, 2), (NN, 2)...",0.4588,0.0,0.789,0.211,,0,0,96,4.388889,0.75,2,3,2,2,2,1,1,0,1,0,2,0,0,0,0,1


# Importing Models

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB

## Train Test Split

In [None]:
# Custom feature list
customFeatures = ['wordCount', 'subjectivity','compoundIntensity','negativeIntensity',
       'neutralIntensity', 'positiveIntensity','i_count',
       'ner_count', 'char_count', 'avg_word_len','modality',
       'NN_Count', 'IN_Count', 'DT_Count', 'JJ_Count', 'NNS_Count', 'PRP_Count',
       'RB_Count', 'VB_Count', 'VBP_Count', 'TO_Count', 'VBZ_Count',
       'CC_Count', 'VBD_Count','VBG_Count', 'VBN_Count', 'CD_Count']


splitColumns = ['sentence_text'] + customFeatures

# Balance the classes by down-sampling the non-experience rows to the number of experience rows
samSize = len(df[df.Experience==1])
df_sampled = pd.concat([df[df.Experience==0].sample(samSize, random_state=150), df[df.Experience==1]])

# Shuffle the balanced dataset (experience and non-experience rows) and renumber rows from 0
df_sampled = df_sampled.sample(frac=1, random_state=50).reset_index(drop=True)

# Train test split (80-20), stratified on the label. It is done BEFORE scaling so the
# scaler below can be fitted on training rows only.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(df_sampled[splitColumns], df_sampled.Experience,
                                                            test_size=0.2, random_state=10,
                                                            stratify=df_sampled.Experience)

# Fit the min/max ranges on the training rows only; fitting on all rows would let
# information about the test set leak into the features used for training.
scaler = MinMaxScaler()
scaler.fit(X_train_raw[customFeatures])

# Scaled copy of the whole sample (same row order and index as df_sampled)
df_exp = pd.DataFrame(scaler.transform(df_sampled[customFeatures]), columns=customFeatures)
df_exp['sentence_text']=df_sampled['sentence_text']
df_exp['Experience']=df_sampled['Experience']

# Select the scaled rows that belong to each side of the split
X_train = df_exp.loc[X_train_raw.index, splitColumns]
X_test = df_exp.loc[X_test_raw.index, splitColumns]


In [None]:
# Three train/test feature sets are prepared here:
#   train / test      - TF-IDF only
#   train_2 / test_2  - TF-IDF + custom features
#   train_3 / test_3  - custom features only

# TF-IDF is fitted on the training sentences and then applied to the test sentences
vecModel = TfidfVectorizer(stop_words='english', analyzer='word')
train = vecModel.fit_transform(X_train['sentence_text'])
test = vecModel.transform(X_test['sentence_text'])

# Only custom features
train_3 = X_train[customFeatures]
test_3 = X_test[customFeatures]

# TF-IDF sparse matrix with the custom feature columns stacked to its right
train_2 = sp.hstack((train, train_3))
test_2 = sp.hstack((test, test_3))

# Models

In [None]:
# Classifiers compared in the experiments below, keyed by display name.
# Every estimator that accepts a seed gets a fixed random_state so reruns give the same scores.
models = {
    'SVM': SVC(C=1, kernel='linear', random_state=10),
    'Random Forest': RandomForestClassifier(n_estimators=100, min_samples_split=5, random_state=10),
    'Decision Tree': DecisionTreeClassifier(random_state=10),
    'KNN': KNeighborsClassifier(n_neighbors=2),
    'Naive Bayes': MultinomialNB(),
}

## Only with TFIDF

In [None]:
# Fit every model on the TF-IDF features and collect its test-set scores
rslt = []
for name, m in models.items():
  m.fit(train, y_train)
  pred = m.predict(test)
  # Classification report as a dict: per-class precision, recall and f1
  report = classification_report(y_test, pred, output_dict=True)
  # Scores of the experience class (label 1)
  experienceScores = report.get('1')

  row = {'Model': name, 'Accuracy': round(accuracy_score(y_test, pred)*100, 2)}
  for column, metric in (('Precision', 'precision'), ('Recall', 'recall'), ('F1', 'f1-score')):
    row[column] = round(experienceScores.get(metric)*100, 2)
  rslt.append(row)

# One row per model, best accuracy first
df_classifier = pd.DataFrame(rslt).sort_values(by='Accuracy', ascending=False).reset_index(drop=True)
df_classifier

Unnamed: 0,Model,Accuracy,Precision,Recall,F1
0,Naive Bayes,66.7,65.59,70.27,67.85
1,SVM,65.15,66.25,61.78,63.94
2,Random Forest,65.15,67.1,59.46,63.05
3,Decision Tree,59.56,59.88,57.92,58.88
4,KNN,50.39,61.11,2.12,4.1


## Only Custom Features

In [None]:
# Fit every model on the custom features only and collect its test-set scores
rslt = []
for name, m in models.items():
  m.fit(train_3, y_train)
  pred = m.predict(test_3)
  report = classification_report(y_test, pred, output_dict=True)
  # Scores of the experience class (label 1)
  experienceScores = report.get('1')

  row = {'Model': name, 'Accuracy': round(accuracy_score(y_test, pred)*100, 2)}
  for column, metric in (('Precision', 'precision'), ('Recall', 'recall'), ('F1', 'f1-score')):
    row[column] = round(experienceScores.get(metric)*100, 2)
  rslt.append(row)

# One row per model, best accuracy first
df_classifier = pd.DataFrame(rslt).sort_values(by='Accuracy', ascending=False).reset_index(drop=True)
df_classifier

Unnamed: 0,Model,Accuracy,Precision,Recall,F1
0,Naive Bayes,69.98,74.13,61.39,67.16
1,Random Forest,69.88,71.64,65.83,68.61
2,SVM,69.69,77.72,55.21,64.56
3,Decision Tree,59.94,59.66,61.39,60.51
4,KNN,59.17,65.37,39.0,48.85


## TFIDF + Custom features

In [None]:
# Fit every model on TF-IDF + custom features and collect its test-set scores
rslt = []
for name, m in models.items():
  m.fit(train_2, y_train)
  pred = m.predict(test_2)
  report = classification_report(y_test, pred, output_dict=True)
  # Scores of the experience class (label 1)
  experienceScores = report.get('1')

  row = {'Model': name, 'Accuracy': round(accuracy_score(y_test, pred)*100, 2)}
  for column, metric in (('Precision', 'precision'), ('Recall', 'recall'), ('F1', 'f1-score')):
    row[column] = round(experienceScores.get(metric)*100, 2)
  rslt.append(row)

# One row per model, best accuracy first
df_classifier = pd.DataFrame(rslt).sort_values(by='Accuracy', ascending=False).reset_index(drop=True)
df_classifier

Unnamed: 0,Model,Accuracy,Precision,Recall,F1
0,SVM,72.3,74.73,67.37,70.86
1,Random Forest,71.91,73.6,68.34,70.87
2,Naive Bayes,71.04,69.82,74.13,71.91
3,Decision Tree,64.29,64.45,63.71,64.08
4,KNN,57.72,71.98,25.29,37.43
