# TF-IDF векторизация

TF-IDF (TF — term frequency, IDF — inverse document frequency)  — это способ векторизации текста, отражающий важность слова в документе (относительно некоторого набора документов).

TF-IDF является произведением TF и IDF.
$$\text{TF-IDF}(w,d,D)=TF(w,d)\cdot IDF(w,D)$$

Посмотрим на примере.

Частота слова (TF) — это мера частоты употребления слова $w$ в документе $d$. TF определяется как отношение числа вхождений слова в документ к общему количеству слов в этом документе.

$$TF(w,d) = \frac{количество\:вхождений\:слова\:w\:в\:документе\:d}{общее\:количество\:слов\:n\:в\:документе\:d}$$

Обратная частота документов (IDF) — это мера важности слова. Некоторые слова могут встречаться очень часто, но при этом не нести большой смысловой нагрузки. IDF присваивает каждому слову вес в зависимости от того, в скольких документах корпуса $D$ оно встречается: чем реже слово, тем больше его вес.

$$IDF(w,D) = \ln\left(\frac{общее\:количество\:документов\:N\:в\:корпусе\:D}{количество\:документов,\:содержащих\:слово\:w}\right)$$

Чем TF больше, тем слово в документе важнее.

Чем IDF больше, тем меньше документов, в которых слово встречается.

Допустим, у нас есть набор предложений:

I love cats

That man sat on my hat

That man has a cat

My cat is under my hat

In [None]:
import numpy as np

In [None]:
# IDF of a word that occurs in 2 of the 4 example sentences: ln(N / df) = ln(4 / 2).
np.log(4/2)

np.float64(0.6931471805599453)

In [None]:
# TF-IDF = TF * IDF, with TF = 1/3 and the IDF from the previous cell rounded to 0.69.
0.69/3

0.22999999999999998

TF(cat, 1): 1/3

TF(my, 4): 2/6 = 1/3

TF-IDF(my, 4): TF(my, 4) · IDF(my) = 1/3 · ln(4/2) ≈ 0.23

TF-IDF(my, 1): 0 — слово my не встречается в документе 1, поэтому TF(my, 1) = 0

In [None]:
import pandas as pd

In [None]:
# Toy corpus: one row per example sentence, with a 1-based document id.
data = pd.DataFrame(
    {
        'id': [1, 2, 3, 4],
        'text': [
            'I love cats',
            'That man sat on my hat',
            'That man has a cat',
            'My cat is under my hat',
        ],
    }
)

In [None]:
data

Unnamed: 0,id,text
0,1,I love cats
1,2,That man sat on my hat
2,3,That man has a cat
3,4,My cat is under my hat


In [None]:
# Relabel the numeric TF-IDF columns with the terms they stand for.
# NOTE(review): `text` and `inv_dict` are only defined in cells further down,
# so this cell fails on a top-to-bottom run — execute those cells first.
text.rename(columns=inv_dict)

Unnamed: 0,cat,cats,hat,love,man,sat
0,0.0,0.707107,0.0,0.707107,0.0,0.0
1,0.0,0.0,0.526405,0.0,0.526405,0.667679
2,0.707107,0.0,0.0,0.0,0.707107,0.0
3,0.707107,0.0,0.707107,0.0,0.0,0.0


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Vectorizer that drops common English stop words before computing TF-IDF weights.
tfidf = TfidfVectorizer(stop_words='english')
# fit_transform learns the vocabulary and IDF weights and vectorizes the corpus
# in one pass, instead of analysing the same texts twice (fit, then transform).
text = tfidf.fit_transform(data.text)  # data.text is the same column as data['text']

In [None]:
text

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 9 stored elements and shape (4, 6)>

In [None]:
# Wrap the SciPy sparse TF-IDF matrix in a DataFrame so it can be shown as a table.
# NOTE(review): this rebinds `text` from the matrix to a DataFrame, so re-running
# this cell on its own presumably fails — re-run the transform cell first.
text = pd.DataFrame.sparse.from_spmatrix(text)

In [None]:
text

Unnamed: 0,0,1,2,3,4,5
0,0.0,0.707107,0.0,0.707107,0.0,0.0
1,0.0,0.0,0.526405,0.0,0.526405,0.667679
2,0.707107,0.0,0.0,0.0,0.707107,0.0
3,0.707107,0.0,0.707107,0.0,0.0,0.0


In [None]:
tfidf.vocabulary_

{'love': 3, 'cats': 1, 'man': 4, 'sat': 5, 'hat': 2, 'cat': 0}

In [None]:
# Invert the vocabulary (term -> column index) into column index -> term,
# so that numeric column labels can be mapped back to words.
inv_dict = {index: term for term, index in tfidf.vocabulary_.items()}
inv_dict

{3: 'love', 1: 'cats', 4: 'man', 5: 'sat', 2: 'hat', 0: 'cat'}

# One-hot encoding

In [None]:
df = pd.read_csv('titanic.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
df.shape

(891, 12)

In [None]:
df.isnull().sum()

Unnamed: 0,0
PassengerId,0
Survived,0
Pclass,0
Name,0
Sex,0
Age,0
SibSp,0
Parch,0
Ticket,0
Fare,0


In [None]:
# Replace missing Age values with the mean of the Age column.
df['Age'] = df['Age'].fillna(df.Age.mean())

In [None]:
# Remove the Cabin column from the frame.
df = df.drop(columns=['Cabin'])

In [None]:
# Drop every row that still has a missing value in any remaining column.
df = df.dropna()

In [None]:
df.shape

(889, 11)

In [None]:
# Feature columns for the model; Sex and Embarked hold string categories
# and are one-hot encoded further down.
x = df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]

In [None]:
x['Sex'].unique()

array(['male', 'female'], dtype=object)

male - 0, 1

female - 1, 0

In [None]:
x['Embarked'].unique()

array(['S', 'C', 'Q'], dtype=object)

S - 0 0 1

C - 0 1 0

Q - 1 0 0

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# sparse_output=False: transform() returns a dense array instead of a sparse matrix.
enc = OneHotEncoder(sparse_output=False)
# Learn the category sets of the two string-valued columns.
enc.fit(x[['Sex', 'Embarked']])

In [None]:
enc.categories_

[array(['female', 'male'], dtype=object), array(['C', 'Q', 'S'], dtype=object)]

In [None]:
# One-hot encode Sex and Embarked. The resulting columns are unnamed (0..4);
# following enc.categories_ they correspond to female, male, C, Q, S in that order.
# The new frame gets a fresh 0-based RangeIndex.
x_c = pd.DataFrame(enc.transform(x[['Sex', 'Embarked']]))

In [None]:
x_c

Unnamed: 0,0,1,2,3,4
0,0.0,1.0,0.0,0.0,1.0
1,1.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,1.0
4,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...
884,0.0,1.0,0.0,0.0,1.0
885,1.0,0.0,0.0,0.0,1.0
886,1.0,0.0,0.0,0.0,1.0
887,0.0,1.0,1.0,0.0,0.0


In [None]:
x_c.index

RangeIndex(start=0, stop=889, step=1)

In [None]:
x.reset_index()

Unnamed: 0,index,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.000000,1,0,7.2500,S
1,1,1,female,38.000000,1,0,71.2833,C
2,2,3,female,26.000000,0,0,7.9250,S
3,3,1,female,35.000000,1,0,53.1000,S
4,4,3,male,35.000000,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
884,886,2,male,27.000000,0,0,13.0000,S
885,887,1,female,19.000000,0,0,30.0000,S
886,888,3,female,29.699118,1,2,23.4500,S
887,889,1,male,26.000000,0,0,30.0000,C


In [None]:
# x still carries the original row labels (with gaps left by dropna), while x_c
# has a fresh 0-based index, so x's index is reset before the two frames are
# joined row-by-row on their indexes. reset_index() keeps the old labels in an
# `index` column.
x_final = pd.merge(x.reset_index(), x_c, left_index=True, right_index=True)
x_final

Unnamed: 0,index,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,0,1,2,3,4
0,0,3,male,22.000000,1,0,7.2500,S,0.0,1.0,0.0,0.0,1.0
1,1,1,female,38.000000,1,0,71.2833,C,1.0,0.0,1.0,0.0,0.0
2,2,3,female,26.000000,0,0,7.9250,S,1.0,0.0,0.0,0.0,1.0
3,3,1,female,35.000000,1,0,53.1000,S,1.0,0.0,0.0,0.0,1.0
4,4,3,male,35.000000,0,0,8.0500,S,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
884,886,2,male,27.000000,0,0,13.0000,S,0.0,1.0,0.0,0.0,1.0
885,887,1,female,19.000000,0,0,30.0000,S,1.0,0.0,0.0,0.0,1.0
886,888,3,female,29.699118,1,2,23.4500,S,1.0,0.0,0.0,0.0,1.0
887,889,1,male,26.000000,0,0,30.0000,C,0.0,1.0,1.0,0.0,0.0


In [None]:
# Drop the saved original index and the raw categorical columns; their
# one-hot encoded replacements (columns 0..4) stay in the frame.
x_final = x_final.drop(columns=['index', 'Sex', 'Embarked'])

In [None]:
x_final

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,0,1,2,3,4
0,3,22.000000,1,0,7.2500,0.0,1.0,0.0,0.0,1.0
1,1,38.000000,1,0,71.2833,1.0,0.0,1.0,0.0,0.0
2,3,26.000000,0,0,7.9250,1.0,0.0,0.0,0.0,1.0
3,1,35.000000,1,0,53.1000,1.0,0.0,0.0,0.0,1.0
4,3,35.000000,0,0,8.0500,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
884,2,27.000000,0,0,13.0000,0.0,1.0,0.0,0.0,1.0
885,1,19.000000,0,0,30.0000,1.0,0.0,0.0,0.0,1.0
886,3,29.699118,1,2,23.4500,1.0,0.0,0.0,0.0,1.0
887,1,26.000000,0,0,30.0000,0.0,1.0,1.0,0.0,0.0


In [None]:
x_final.dtypes

Unnamed: 0,0
Pclass,int64
Age,float64
SibSp,int64
Parch,int64
Fare,float64
0,float64
1,float64
2,float64
3,float64
4,float64


# Подбор оптимальных гиперпараметров модели

Гиперпараметры — это характеристики модели, которые мы задаём заранее, при её инициализации, а не подбираем в процессе обучения.

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegressionCV

In [None]:
# NOTE(review): LogisticRegressionCV already runs its own internal cross-validation
# to choose C, so passing it to GridSearchCV nests two cross-validation loops.
# Plain LogisticRegression may have been intended — confirm.
lm = LogisticRegressionCV()

In [None]:
# Hyperparameter grid: every solver listed here is tried by the search.
# To search over more hyperparameters, add further entries of the form
# NAME=[v1, v2, v3].
params = dict(
    solver=[
        'lbfgs',
        'liblinear',
        'newton-cg',
        'newton-cholesky',
        'sag',
        'saga',
    ],
)

In [None]:
gs = GridSearchCV(lm, params)

In [None]:
# Fit on plain NumPy arrays. NOTE(review): x_final has both string and integer
# column names, which scikit-learn presumably rejects for DataFrame input —
# likely the reason for .to_numpy(); confirm.
gs.fit(x_final.to_numpy(), df['Survived'].to_numpy())

In [None]:
# NOTE: predictions are made on the same rows the grid search was fitted on
# (x_final), so any accuracy computed from them is a training-set score,
# not an estimate of performance on unseen data.
preds = gs.best_estimator_.predict(x_final.to_numpy())

In [None]:
preds[:10]

array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1])

In [None]:
tr = df['Survived'].to_numpy()

In [None]:
tr[:10]

array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1])

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
accuracy_score(tr, preds)

0.8008998875140607

# Практикум

В продолжение предыдущего задания:



1.   Предобработать датасет со спамом, векторизовать его по методу tf-idf и провести классификацию.
2.   Для модели, с помощью которой вы проводили классификацию, провести gridsearch и подобрать оптимальные параметры.



In [1]:
import pandas as pd

# Load the spam dataset; the file is decoded as latin-1.
df = pd.read_csv('spam.csv', encoding='latin-1')

# Preview the first rows.
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


Предобработка данных

In [3]:
import re
import string

# string.punctuation contains regex metacharacters (`\`, `]`, `^`, `-`), so it
# has to be escaped before being placed inside a character class. Unescaped,
# the `\]` pair is read as an escaped bracket and backslashes are never removed.
punctuation_pattern = f"[{re.escape(string.punctuation)}]"

# Normalize the messages: lowercase, strip punctuation, collapse runs of
# whitespace into a single space and trim the ends.
df['text'] = (
    df['text']
    .str.lower()
    .str.replace(punctuation_pattern, "", regex=True)
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
)

df.head()

Unnamed: 0,label,text
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif u oni
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor u c already then say
4,ham,nah i dont think he goes to usf he lives aroun...


Разделяем данные на обучающую и тестовую выборки (80 % и 20 % соответственно)

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for testing; random_state is fixed so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['label'], test_size=0.2, random_state=42
)

# Report how many messages ended up in each part.
print("Размер обучающей выборки:", X_train.shape[0])
print("Размер тестовой выборки:", X_test.shape[0])


Размер обучающей выборки: 4457
Размер тестовой выборки: 1115


Векторизирую с помощью метода TF-IDF

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Drop English stop words and cap the vocabulary at 3000 terms.
vectorizer = TfidfVectorizer(stop_words='english', max_features=3000)

# The vocabulary and IDF weights are learned from the training texts only;
# the test texts are then transformed with that same fitted vectorizer.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Both matrices share the same number of columns (one per vocabulary term).
print("Размер обучающей матрицы:", X_train_tfidf.shape)
print("Размер тестовой матрицы:", X_test_tfidf.shape)


Размер обучающей матрицы: (4457, 3000)
Размер тестовой матрицы: (1115, 3000)


Обучаем модель классификации

In [6]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier; max_iter sets the solver's iteration limit to 1000.
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

# Predicted labels for the held-out test messages.
preds = model.predict(X_test_tfidf)


Оценка качества модели


In [7]:
from sklearn.metrics import accuracy_score, classification_report

# Overall accuracy on the test set, followed by per-class precision/recall/F1.
# The per-class report matters here because the classes are imbalanced
# (965 ham vs 150 spam in the test set, per the output below).
print("Accuracy:", accuracy_score(y_test, preds))
print("\nОтчет по классам:\n", classification_report(y_test, preds))


Accuracy: 0.957847533632287

Отчет по классам:
               precision    recall  f1-score   support

         ham       0.96      1.00      0.98       965
        spam       0.98      0.70      0.82       150

    accuracy                           0.96      1115
   macro avg       0.97      0.85      0.90      1115
weighted avg       0.96      0.96      0.95      1115



Подбираю гиперпараметры с помощью GridSearchCV

In [8]:
from sklearn.model_selection import GridSearchCV

# Candidate values: regularization parameter C and the optimization solver.
params = {
    'C': [0.1, 1, 10],
    'solver': ['lbfgs', 'liblinear']
}

# 5-fold cross-validated search over all C/solver combinations;
# n_jobs=-1 runs the fits in parallel on all available cores.
grid = GridSearchCV(
    LogisticRegression(max_iter=1000),
    param_grid=params,
    cv=5,
    n_jobs=-1
)

# NOTE(review): the TF-IDF features were fitted on the whole training set before
# this search, so each validation fold has already influenced the vocabulary and
# IDF weights; putting the vectorizer and classifier in a Pipeline would avoid that.
grid.fit(X_train_tfidf, y_train)

# best_score_ is the mean cross-validation score on the training data; the
# tuned model is not evaluated on the held-out test set in this cell.
print("Лучшие параметры:", grid.best_params_)
print("Лучшая средняя точность (cv):", grid.best_score_)


Лучшие параметры: {'C': 10, 'solver': 'lbfgs'}
Лучшая средняя точность (cv): 0.9775618667995349
