# Лабораторная №6. Выбор признаков

#### Imports

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import strip_accents_ascii,TfidfVectorizer
from string import ascii_letters, whitespace
import nltk
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest,chi2,RFE,SelectFpr
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import warnings
warnings.filterwarnings('ignore')

### Dataset

In [2]:
data = pd.read_csv("SMS.tsv", delimiter="\t")
data

Unnamed: 0,class,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
data.text = data.text.apply(lambda msg: ''.join(filter(lambda c: c in ascii_letters + whitespace, strip_accents_ascii(msg).lower())))
data

Unnamed: 0,class,text
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif u oni
2,spam,free entry in a wkly comp to win fa cup final...
3,ham,u dun say so early hor u c already then say
4,ham,nah i dont think he goes to usf he lives aroun...
...,...,...
5567,spam,this is the nd time we have tried contact u u...
5568,ham,will u b going to esplanade fr home
5569,ham,pity was in mood for that soany other suggest...
5570,ham,the guy did some bitching but i acted like id ...


In [4]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Tema\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
Y = pd.get_dummies(data['class']).ham
tfidf = TfidfVectorizer(max_features=1500, stop_words=stopwords.words('english'))
X = pd.DataFrame(data=tfidf.fit_transform(data.text).toarray(), columns=tfidf.get_feature_names_out())
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

### Embeeded method

In [6]:
rfc = RandomForestClassifier()
rfc.fit(X_train, Y_train)
res = sorted([(rfc.feature_importances_[i], c) for i,c in enumerate(X.columns)], key=lambda x: -x[0])

rfc_res = list(list(zip(*res))[1])[:30]

### Wrapper method

In [7]:
gaussianNB = GaussianNB()
wrapper_res = []
for i in range(30):
    best_score = 0.0
    best_feature = ''
    for feature in X_train:
        train = pd.DataFrame(X_train, columns=wrapper_res)
        test = pd.DataFrame(X_test, columns=wrapper_res)
        train[feature] = X_train[feature]
        test[feature] = X_test[feature]
        
        gaussianNB.fit(train.values, Y_train)

        score = accuracy_score(Y_test, gaussianNB.predict(test.values))
        if score > best_score:
            best_score = score
            best_feature = feature
    # print(i, best_feature)
    wrapper_res.append(best_feature)

### Filter method

In [8]:
filter_res = list(X_train[X_train.corrwith(Y_train).abs().sort_values(ascending=False)[:30].index].columns)

In [9]:
pd.DataFrame(data={
    "Embeeded method": rfc_res,
    "Wrapper method": wrapper_res,
    "Filter method": filter_res,
})

Unnamed: 0,Embeeded method,Wrapper method,Filter method
0,call,txt,txt
1,txt,claim,claim
2,free,box,free
3,mobile,service,mobile
4,claim,reply,prize
5,stop,camera,urgent
6,win,customer,call
7,prize,bak,win
8,text,nokia,stop
9,reply,apply,guaranteed


### Library methods

In [10]:
def get_by_mask(mask):
    return np.ma.compressed(np.ma.masked_where(mask != True, X_train.columns))

In [11]:
rfe = RFE(
    RandomForestClassifier(n_estimators=10),
    n_features_to_select=30,
    step=20
)
rfe.fit(X_train, Y_train)
rfe_res = np.ma.compressed(np.ma.masked_where(rfe.get_support() != True, X_train.columns))

In [12]:
skb = SelectKBest(score_func=chi2, k=30)
skb.fit(X_train, Y_train)
skb_res = sorted([(skb.scores_[i], c) for i,c in enumerate(X.columns)], key=lambda x: -x[0])
skb_res = list(list(zip(*skb_res))[1])[:30]

In [13]:
fpr = SelectFpr()
fpr.fit(X_train, Y_train)
fpr_res = sorted([(fpr.scores_[i], c) for i,c in enumerate(X.columns)], key=lambda x: -x[0])
fpr_res = list(list(zip(*fpr_res))[1])[:30]

In [14]:
pd.DataFrame(data={
    "embedded method (RFC)": rfc_res,
    "wrapper method (GaussianNB)": wrapper_res,
    "filter method (Pearson correlation)": filter_res,
    "library RFE with RFC": rfe_res,
    "library SelectKBest with chi2": skb_res,
    "library SelectFpr": fpr_res,
})

Unnamed: 0,embedded method (RFC),wrapper method (GaussianNB),filter method (Pearson correlation),library RFE with RFC,library SelectKBest with chi2,library SelectFpr
0,call,txt,txt,call,free,txt
1,txt,claim,claim,chat,txt,claim
2,free,box,free,claim,claim,free
3,mobile,service,mobile,free,prize,mobile
4,claim,reply,prize,get,mobile,prize
5,stop,camera,urgent,hi,urgent,urgent
6,win,customer,call,ill,stop,call
7,prize,bak,win,im,call,win
8,text,nokia,stop,mobile,win,stop
9,reply,apply,guaranteed,new,nokia,guaranteed


In [15]:
results = pd.DataFrame(index=["all features","RFC","Wrapper","Filter","RFE","chi2","FPR"])
for clf_name, clf in [
    ("KNN", KNeighborsClassifier()),
    ("RFC", RandomForestClassifier()),
    ("SVC", SVC())
]:
    clf_res = []
    for features in [
        X.columns,
        rfc_res,
        wrapper_res,
        filter_res,
        rfe_res,
        skb_res,
        fpr_res
    ]:
        x_train = pd.DataFrame(X_train, columns=features).values
        x_test = pd.DataFrame(X_test, columns=features).values
        clf.fit(x_train, Y_train)
        score = accuracy_score(Y_test, clf.predict(x_test))
        clf_res.append(score)
    results[clf_name] = clf_res

In [16]:
results

Unnamed: 0,KNN,RFC,SVC
all features,0.921076,0.973094,0.973094
RFC,0.955157,0.95157,0.956951
Wrapper,0.958744,0.959641,0.957848
Filter,0.95426,0.960538,0.952466
RFE,0.962332,0.956951,0.963229
chi2,0.955157,0.958744,0.95426
FPR,0.95426,0.960538,0.952466
