# ЛР №6 по курсу "Методы машинного обучения"  
# Классификация текста.  
**Чжан Чжибо  ИУ5И-21М**  
**Цель лабораторной работы**: изучение методов классификации текстов.  
**Задание**:  
Для произвольного набора данных, предназначенного для классификации текстов, решите задачу классификации текста двумя способами:

Способ 1. На основе CountVectorizer или TfidfVectorizer.  
Способ 2. На основе моделей word2vec или Glove или fastText.  
Сравните качество полученных моделей.

In [6]:
import numpy as np
import pandas as pd
from typing import Dict, Tuple
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, median_absolute_error, r2_score 
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
import seaborn as sns
from collections import Counter
from sklearn.datasets import fetch_20newsgroups
import matplotlib.pyplot as plt

%matplotlib inline 
sns.set(style="ticks")

In [7]:
def accuracy_score_for_classes(
    y_true: np.ndarray, 
    y_pred: np.ndarray) -> Dict[int, float]:
    """
    Вычисление метрики accuracy для каждого класса
    y_true - истинные значения классов
    y_pred - предсказанные значения классов
    Возвращает словарь: ключ - метка класса, 
    значение - Accuracy для данного класса
    """
    # Для удобства фильтрации сформируем Pandas DataFrame 
    d = {'t': y_true, 'p': y_pred}
    df = pd.DataFrame(data=d)
    # Метки классов
    classes = np.unique(y_true)
    # Результирующий словарь
    res = dict()
    # Перебор меток классов
    for c in classes:
        # отфильтруем данные, которые соответствуют 
        # текущей метке класса в истинных значениях
        temp_data_flt = df[df['t']==c]
        # расчет accuracy для заданной метки класса
        temp_acc = accuracy_score(
            temp_data_flt['t'].values, 
            temp_data_flt['p'].values)
        # сохранение результата в словарь
        res[c] = temp_acc
    return res

def print_accuracy_score_for_classes(
    y_true: np.ndarray, 
    y_pred: np.ndarray):
    """
    Вывод метрики accuracy для каждого класса
    """
    accs = accuracy_score_for_classes(y_true, y_pred)
    if len(accs)>0:
        print('Метка \t Accuracy')
    for i in accs:
        print('{} \t {}'.format(i, accs[i]))

Загрузка данных:

In [8]:
%cd /content/drive/MyDrive/dataset/spam/

/content/drive/MyDrive/dataset/spam


In [9]:
dataset = pd.read_csv("enron_spam_data.csv")
dataset.head()

Unnamed: 0.1,Unnamed: 0,Subject,Message,Spam/Ham,Date
0,0,christmas tree farm pictures,,ham,1999-12-10
1,1,"vastar resources , inc .","gary , production from the high island larger ...",ham,1999-12-13
2,2,calpine daily gas nomination,- calpine daily gas nomination 1 . doc,ham,1999-12-14
3,3,re : issue,fyi - see note below - already done .\nstella\...,ham,1999-12-14
4,4,meter 7268 nov allocation,fyi .\n- - - - - - - - - - - - - - - - - - - -...,ham,1999-12-14


In [10]:
dataset=dataset.drop(['Unnamed: 0','Subject','Date'],axis=1)
dataset.head()

Unnamed: 0,Message,Spam/Ham
0,,ham
1,"gary , production from the high island larger ...",ham
2,- calpine daily gas nomination 1 . doc,ham
3,fyi - see note below - already done .\nstella\...,ham
4,fyi .\n- - - - - - - - - - - - - - - - - - - -...,ham


In [11]:
dataset['Spam/Ham']=dataset['Spam/Ham'].replace(['ham','spam'],[0,1])
dataset.head()

Unnamed: 0,Message,Spam/Ham
0,,0
1,"gary , production from the high island larger ...",0
2,- calpine daily gas nomination 1 . doc,0
3,fyi - see note below - already done .\nstella\...,0
4,fyi .\n- - - - - - - - - - - - - - - - - - - -...,0


In [12]:
dataset=dataset.dropna()
dataset.head()

Unnamed: 0,Message,Spam/Ham
1,"gary , production from the high island larger ...",0
2,- calpine daily gas nomination 1 . doc,0
3,fyi - see note below - already done .\nstella\...,0
4,fyi .\n- - - - - - - - - - - - - - - - - - - -...,0
5,"jackie ,\nsince the inlet to 3 river plant is ...",0


In [17]:
dataset=dataset.sample(frac=1)
dataset.head()

Unnamed: 0,Message,Spam/Ham
7683,"gentleman ,\nkevin presto concurred on the pur...",0
434,daren or stacey : could you please extend deal...,0
19561,start date : 2 / 6 / 02 ; hourahead hour : 24 ...,1
31175,"fyi , kim .\n- - - - - original message - - - ...",1
27668,attached is the latest version of the cost cen...,1


In [18]:
dataset.describe()

Unnamed: 0,Spam/Ham
count,33664.0
mean,0.51007
std,0.499906
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [20]:
train_df=(dataset.iloc[0:26664,:])
test_df=(dataset.iloc[26664:33664,:])

In [23]:
train_df.describe()

Unnamed: 0,Spam/Ham
count,26664.0
mean,0.509338
std,0.499922
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [22]:
test_df.describe()

Unnamed: 0,Spam/Ham
count,7000.0
mean,0.512857
std,0.49987
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


# Способ 1. На основе CountVectorizer или TfidfVectorizer.

Сформируем общий словарь для обучения моделей из обучающей и тестовой выборки

In [24]:
vocab_list = train_df['Message'].tolist()
vocab_list[1:10]

['daren or stacey : could you please extend deal # 169625 for meter 1520 for\nfeb . 2000 ?\nthanks .\naimee\n- - - - - - - - - - - - - - - - - - - - - - forwarded by aimee lannou / hou / ect on 03 / 11 / 2000 09 : 46\nam - - - - - - - - - - - - - - - - - - - - - - - - - - -\nanita luong\n03 / 10 / 2000 05 : 23 pm\nto : aimee lannou / hou / ect @ ect\ncc :\nsubject : re : allocation exceptions\nmeter 5191 track id is 89328 .\nmeter 1520 need to extend or deal or add new deal .\naimee lannou 03 / 10 / 2000 02 : 57 pm\nto : anita luong / hou / ect @ ect\ncc :\nsubject : allocation exceptions\nanita - i need some accounting arrangements created for two meters .\nmeter contract receipt / delivery counterparty deal # last month allocated\n5191 gathering receipt tri - union development 100896\ncorp . 138661 jan 00\n1520 hpl 215 delivery engage 169625 jan 00\nif youneed more information , please let e know .\n- aimee',
 "start date : 2 / 6 / 02 ; hourahead hour : 24 ; no ancillary schedules aw

In [25]:
def VectorizeAndClassify(vectorizers_list, classifiers_list):
    for v in vectorizers_list:
        for c in classifiers_list:
            pipeline1 = Pipeline([("vectorizer", v), ("classifier", c)])
            score = cross_val_score(pipeline1, train_df['Message'], train_df['Spam/Ham'], scoring='accuracy', cv=3).mean()
            print('Векторизация - {}'.format(v))
            print('Модель для классификации - {}'.format(c))
            print('Accuracy = {}'.format(score))
            print('===========================')

In [26]:
vocabVect = CountVectorizer()
vocabVect.fit(vocab_list)
corpusVocab = vocabVect.vocabulary_
print('Количество сформированных признаков - {}'.format(len(corpusVocab)))

Количество сформированных признаков - 54038


In [27]:
vectorizers_list = [CountVectorizer(vocabulary = corpusVocab), TfidfVectorizer(vocabulary = corpusVocab)]
classifiers_list = [LogisticRegression(), MultinomialNB()]
VectorizeAndClassify(vectorizers_list, classifiers_list)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Векторизация - CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None,
                vocabulary={'00': 0, '000': 1, '0000': 2, '000000': 3,
                            '00...
                            '000000000005413': 12, '000000000005820': 13,
                            '000000000006238': 14, '000000000007399': 15,
                            '000000000007494': 16, '000000000007498': 17,
                            '000000000007588': 18, '000000000007589': 19,
                            '000000000007590': 20, '000000000007591': 21,
                            '000000000007592': 22, '000000000007593': 23,
                            '00000000

Лучшую точность показал CountVectorizer и LogisticRegression (99,93%)

In [31]:
X_train=train_df['Message']
y_train=train_df['Spam/Ham']
X_test=test_df['Message']
y_test=test_df['Spam/Ham']

In [32]:
def sentiment(v, c):
    model = Pipeline(
        [("vectorizer", v), 
         ("classifier", c)])
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print_accuracy_score_for_classes(y_test, y_pred)

In [33]:
sentiment(CountVectorizer(), LogisticRegression(C=5.0))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Метка 	 Accuracy
0 	 0.9994134897360704
1 	 1.0


# Способ 2. На основе моделей word2vec

In [34]:
import re
import pandas as pd
import numpy as np
from typing import Dict, Tuple
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from nltk import WordPunctTokenizer
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

Подготовим корпус

In [53]:
corpus = []
stop_words = stopwords.words('english')
tok = WordPunctTokenizer()
for line in dataset['Message'].values:
    line1 = line.strip().lower()
    line1 = re.sub("[^a-zA-Z]"," ", line1)
    text_tok = tok.tokenize(line1)
    text_tok1 = [w for w in text_tok if not w in stop_words]
    corpus.append(text_tok1)


In [54]:
corpus[:5]

[['gentleman',
  'kevin',
  'presto',
  'concurred',
  'purchase',
  'site',
  'license',
  'recommended',
  'vince',
  'thoughts',
  'others',
  'available',
  'demo',
  'package',
  'others',
  'would',
  'like',
  'see',
  'thanks',
  'lance',
  'forwarded',
  'lance',
  'cunningham',
  'na',
  'enron',
  'pm',
  'vince',
  'j',
  'kaminski',
  'ect',
  'vince',
  'j',
  'kaminski',
  'hou',
  'ect',
  'ect',
  'richard',
  'lewis',
  'lon',
  'ect',
  'ect',
  'tim',
  'belden',
  'hou',
  'ect',
  'ect',
  'tim',
  'heizenrader',
  'enron',
  'com',
  'kevin',
  'presto',
  'hou',
  'ect',
  'ect',
  'george',
  'hopley',
  'hou',
  'ect',
  'ect',
  'cc',
  'lance',
  'cunningham',
  'na',
  'enron',
  'enron',
  'subject',
  'site',
  'license',
  'power',
  'world',
  'gentlemen',
  'recommend',
  'purchase',
  'package',
  'split',
  'cost',
  'ways',
  'power',
  'trading',
  'desks',
  'think',
  'go',
  'option',
  'lance',
  'cunningham',
  'group',
  'looked',
  'software

Количество текстов в корпусе не изменилось и соответствует целевому признаку

In [56]:
assert dataset.shape[0]==len(corpus)

In [57]:
import gensim
from gensim.models import word2vec
%time model = word2vec.Word2Vec(corpus, workers=4, min_count=10, window=10, sample=1e-3)

CPU times: user 1min 18s, sys: 442 ms, total: 1min 18s
Wall time: 42.3 s


Проверим, что модель обучилась

In [58]:
print(model.wv.most_similar(positive=['find'], topn=5))

[('contacts', 0.4571227431297302), ('complete', 0.4518755376338959), ('internally', 0.44660910964012146), ('see', 0.44049549102783203), ('samples', 0.4352661371231079)]


In [64]:
def sentiment(v, c):
    model = Pipeline(
        [("vectorizer", v), 
         ("classifier", c)])
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print_accuracy_score_for_classes(y_test, y_pred)

In [65]:
class EmbeddingVectorizer(object):
    def __init__(self, model):
        self.model = model
        self.size = model.vector_size

    def fit(self, X, y):
        return self

    def transform(self, X):
        return np.array([np.mean(
            [self.model[w] for w in words if w in self.model] 
            or [np.zeros(self.size)], axis=0)
            for words in X])

In [67]:
boundary = 26664
X_train = corpus[:boundary] 
X_test = corpus[boundary:]
y_train = dataset['Spam/Ham'][:boundary]
y_test = dataset['Spam/Ham'][boundary:]

In [68]:
sentiment(EmbeddingVectorizer(model.wv), LogisticRegression(C=5.0))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Метка 	 Accuracy
0 	 0.9982404692082112
1 	 1.0


Лучшую точность показал CountVectorizer и LogisticRegression 