# 1. check if adding a feature statistically significant increase the prediction accuracy.


In [74]:
# goal 1: use many other classifier to do text classification
# goal 2: show how to text the value of a feature rigourously

In [75]:
import nltk
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
# BernoulliNB works when the features are binary value
# MultinomialNB works when the features are discrete
#when the feature is not binary or multinimial bydefault GaussianNB is implemented.

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.corpus import names
import random
import statistics
from scipy import stats

In [76]:
# defining the object set
allname=[(name,'male') for name in names.words('male.txt')]+[(name,'female') for name in names.words('female.txt')]

In [77]:
# define the baseline feature function
def feature1(text):
    return {'last_letter':text[-1]}

In [78]:
# define the testing feature
def feature2(text):
    return {'last_letter':text[-1], 'first_letter':text[0]}

In [79]:
featureset1=[(feature1(name),clas) for name,clas in allname]
featureset2=[(feature2(name),clas) for name,clas in allname]

In [80]:
trainsize=int(0.8*len(featureset1))

In [81]:
# define classifier objects
BNB=SklearnClassifier(BernoulliNB())
MNB=SklearnClassifier(MultinomialNB())
L1=SklearnClassifier(LogisticRegression(solver='newton-cg'))
L2=SklearnClassifier(LogisticRegression(solver='liblinear'))
SVC=SklearnClassifier(SVC(gamma='auto'))
LSVC=SklearnClassifier(LinearSVC())
NuSVC=SklearnClassifier(NuSVC(gamma='auto'))

In [100]:
# define a function to test accuracy
def accuracy(featureset):
    accuracy=[]
    for i in range(10):
#         print(i)
        random.shuffle(featureset)
        train=featureset[:trainsize]
        test=featureset[trainsize:]
        nltkNB=nltk.NaiveBayesClassifier.train(train)
        nltkDT=nltk.DecisionTreeClassifier.train(train)
        BNB.train(train);MNB.train(train);L1.train(train);L2.train(train)
        SVC.train(train);LSVC.train(train);NuSVC.train(train)
        # we report accuracy using a list of tuple
        # each tuple represent each iteration
        # [(acc1,acc2,...,acc9),(acc1,acc2,...,acc9),(acc1,acc2,...,acc9)....]
        acc1=nltk.classify.accuracy(nltkNB,test)
        acc2=nltk.classify.accuracy(nltkDT,test)
        acc3=nltk.classify.accuracy(BNB,test)
        acc4=nltk.classify.accuracy(MNB,test)
        acc5=nltk.classify.accuracy(L1,test)
        acc6=nltk.classify.accuracy(L2,test)
        acc7=nltk.classify.accuracy(SVC,test)
        acc8=nltk.classify.accuracy(LSVC,test)
        acc9=nltk.classify.accuracy(NuSVC,test)
        accuracy.append((acc1,acc2,acc3,acc4,acc5,acc6,acc7,acc8,acc9))
    print([statistics.mean(x) for x in accuracy])
#     return accuracy

In [101]:
accuracy_baseline=accuracy(featureset1)
accuracy_testing=accuracy(featureset2)
print(accuracy_baseline)

[0.7642122928466542, 0.7564505978602895, 0.7575694007412069, 0.7498077057548422, 0.7518355359765051, 0.763373190685966, 0.759876931683099, 0.7601566324033284, 0.7656807216278582, 0.7671491504090623]
[0.7778477029578351, 0.766869449688833, 0.7786868051185232, 0.7879868540661492, 0.7656107964478008, 0.7563107475001748, 0.7736521921543948, 0.7766589748968603, 0.7644919935668835, 0.7742115935948535]
None


In [102]:
print(accuracy_testing)

None


In [98]:
# draw inference on if the added feature statistically significantly improve the predictive power
# featureset 1: [(acc1,acc2,...,acc9),(acc1,acc2,...,acc9),(acc1,acc2,...,acc9)....]
# featuerset 2: [(acc1,acc2,...,acc9),(acc1,acc2,...,acc9),(acc1,acc2,...,acc9)....]

In [99]:
for i in range(9):
    acc_baseline=[x[i] for x in accuracy_baseline]
    acc_testing=[x[i] for x in accuracy_testing]
    ttest=stats.ttest_ind(acc_baseline,acc_testing,equal_var=False)#paired t-test to calculate if the acc from featureset1 are lower/higher than featureset1 
    print(ttest)
#A p-value less than 0.05 (typically ≤ 0.05) is statistically significant. ... 
#A p-value higher than 0.05 (> 0.05) is not statistically significant and 
#indicates strong evidence for the null hypothesis. This means we retain the null hypothesis 
#and reject the alternative hypothesis.

#eg: pvalue=0.9781324701546347 is very large. not statistically significant

Ttest_indResult(statistic=-3.144519381413715, pvalue=0.005675767514500799)
Ttest_indResult(statistic=-4.185171494118548, pvalue=0.000617477877669631)
Ttest_indResult(statistic=-1.8610658173765826, pvalue=0.07929739174009337)
Ttest_indResult(statistic=-3.1788065244158537, pvalue=0.005262574801584489)
Ttest_indResult(statistic=-3.806503546591098, pvalue=0.0013118968975974134)
Ttest_indResult(statistic=-3.806503546591098, pvalue=0.0013118968975974134)
Ttest_indResult(statistic=-0.027797445995329642, pvalue=0.9781324701546347)
Ttest_indResult(statistic=-3.721408969022572, pvalue=0.0015633335250358291)
Ttest_indResult(statistic=0.20417215141764028, pvalue=0.8405096865287742)


# 2. variable number of arguments


In [108]:
def addup(*numbers): # define a series of inputs
    summation=0
    for each in numbers:
        summation+=each
    return summation
# the variable numbers is a tuple

In [109]:
print(addup(4,3,2))

9


In [105]:
# another example
class employee:
    
    def __init__(self,*sales):
        self.sales=sales
    
    def printallsale(self):
        print(self.sales)
        
emp1=employee(100,2000,400,500)
emp1.printallsale()

(100, 2000, 400, 500)


In [None]:
class employee:
    
    def __init__(self,*sales):
        self.sales=sales
    
    def printallsale(self):
        print(self.sales)
        
emp1=employee(100,2000,400,500)
emp1.printallsale()

# 3. build a cooperate classifier


In [None]:
# cooperative classifier
# suppose we have 9 classifiers. If 6 votes for male, and 3 votes for female, 
# then we classify this object to male with a 6/9=66.7% confidence
#sometimes the prediction results of the classifiers are different. So create a voting mechansim.

In [110]:
import nltk
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
# BernoulliNB works when the features are binary value
# MultinomialNB works when the features are discrete
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.corpus import names
import statistics
import random

In [111]:
# create classifier object
BNB=SklearnClassifier(BernoulliNB())
MNB=SklearnClassifier(MultinomialNB())
L1=SklearnClassifier(LogisticRegression(solver='newton-cg'))
L2=SklearnClassifier(LogisticRegression(solver='liblinear'))
SVC=SklearnClassifier(SVC(gamma='auto'))
LSVC=SklearnClassifier(LinearSVC())
NuSVC=SklearnClassifier(NuSVC(gamma='auto'))

In [116]:
# train classifiers
allname=[(name,'male') for name in names.words('male.txt')]+[(name,'female') for name in names.words('female.txt')]
def feature(text):
    return {'last_letter':text[-1], 'first_letter':text[0]}
featureset=[(feature(name),clas) for name,clas in allname]
random.shuffle(featureset)
trainsize=int(0.8*len(featureset))
train=featureset[:trainsize]
test=featureset[trainsize:]#list of tuple. 1st element is he feature and 2nd element is the class.
nltkNB=nltk.NaiveBayesClassifier.train(train)
nltkDT=nltk.DecisionTreeClassifier.train(train)
BNB.train(train);MNB.train(train);L1.train(train);L2.train(train)
SVC.train(train);LSVC.train(train);NuSVC.train(train)

<SklearnClassifier(NuSVC(break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
      decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
      max_iter=-1, nu=0.5, probability=False, random_state=None, shrinking=True,
      tol=0.001, verbose=False))>

In [117]:
# define the cooperative classifier class
class Cooperative_classifier:
    
    def __init__(self,*classifiers):
        self.classifier=classifiers
        self.vote=None
        
    def classify(self,feature):#input feature
        vote=[]
        for i in self.classifier:
            res=i.classify(feature)
            vote.append(res)
        self.vote=vote
        return statistics.mode(vote)#mode is the most frequent element in the list. for confidence we take out the most common element.
    
    def confidence(self):
        return self.vote.count(statistics.mode(self.vote))/len(self.vote)


In [118]:
C=Cooperative_classifier(nltkNB,nltkDT,BNB,MNB,L1,L2,SVC,LSVC,NuSVC)
print(C)

In [120]:
# calcualte the accuracy for the cooperative classifier
num_correct=0
num_attempt=0
for i in test:
    num_attempt+=1
    feature=i[0]
    true_class=i[1]
    predicted=C.classify(feature)
    if predicted==true_class:
        num_correct+=1

print(num_correct/num_attempt)

0.788546255506608
