# Лабораторная работа № 5

In [252]:
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from nltk.stem import PorterStemmer
from sklearn.feature_selection import chi2
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import numpy as np



[nltk_data] Downloading package punkt to
[nltk_data]     /Users/tainazitina/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Загрузка данных

In [253]:
human_data = pd.read_csv('human_text.txt', sep='\t', header=None, names=['text'])
robot_data = pd.read_csv('robot_text.txt', sep='\t', header=None, names=['text'])

# Добавление меток классов
human_data['label'] = 0  # 0 - человек
robot_data['label'] = 1  # 1 - робот

# Объединение данных
data = pd.concat([human_data, robot_data], ignore_index=True)


## Предобработка данных

In [254]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

# в нижнем регистре
data['lowercase'] = data['text'].apply(lambda x: x.lower())

def stop_words(text):
    words = text.split()
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)

# без стоп слов
data['stop_words'] = data['text'].apply(lambda x: stop_words(x))

# vectorizer = CountVectorizer()
# X = vectorizer.fit_transform(data['text'])
# y = data['label']

# chi2_stat, p_values = chi2(X, y)
# selected_words_indices = np.argsort(chi2_stat)[-100:]
# selected_words = [vectorizer.get_feature_names_out()[i] for i in selected_words_indices]

# filtered_data = []
# for text in data['text']:
#   words = text.split()
#   filtered_words = [word for word in words if word in selected_words]
#   filtered_data.append(' '.join(filtered_words))
# # только специфичные слова
# data['filter'] = filtered_data
# data

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tainazitina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [255]:
from collections import Counter

human_combined_string = ' '.join(human_data['text'])
human_words = human_combined_string.split()
human_word_frequency = Counter(human_words)
human_frequency_dict = {key: value / len(human_words) for key, value in human_word_frequency.items()}


robot_combined_string = ' '.join(robot_data['text'])
robot_words = robot_combined_string.split()
robot_word_frequency = Counter(robot_words)
robot_frequency_dict = {key: value / len(robot_words) for key, value in robot_word_frequency.items()}
  
unique_words = set(human_words + robot_words)

result_array = []
for word in unique_words:
  freq1 = human_frequency_dict.get(word, 0)
  freq2 = robot_frequency_dict.get(word, 0)
  result_array.append([word, freq1, freq2])

unic_human_word = []
unic_robot_word = []

for i in range(len(result_array)):
  if result_array[i][1] == 0:
    unic_robot_word.append(result_array[i][0])
  if result_array[i][2] == 0:
    unic_human_word.append(result_array[i][0])
  
filtered_data = []
for text in data['text']:
  words = text.split()
  filtered_words = [word for word in words if word in unic_human_word or word in unic_robot_word]
  filtered_data.append(' '.join(filtered_words))
# только специфичные слова
data['filter'] = filtered_data

df = pd.DataFrame(result_array, columns = ['text', 'human', 'robot'])
df

Unnamed: 0,text,human,robot
0,friedrich,0.000000,0.000043
1,why,0.002416,0.001805
2,ago..,0.000000,0.000043
3,estaba,0.000045,0.000043
4,93%,0.000000,0.000043
...,...,...,...
6691,"dice,",0.000045,0.000000
6692,"design,",0.000045,0.000000
6693,gentlemen.,0.000045,0.000000
6694,uuuuuuh,0.000045,0.000000


In [264]:
data[-100:]

Unnamed: 0,text,label,lowercase,stop_words,filter
4597,si ?,1,si ?,si ?,
4598,no tengo el placer de conocer ese lugar ! tu ...,1,no tengo el placer de conocer ese lugar ! tu ...,tengo el placer de conocer ese lugar ! tu que ...,placer conocer
4599,me gustaría conocerlo entonces ! ! 😁,1,me gustaría conocerlo entonces ! ! 😁,gustaría conocerlo entonces ! ! 😁,conocerlo
4600,ooooh !,1,ooooh !,ooooh !,ooooh
4601,iiiiiiiiiiih !,1,iiiiiiiiiiih !,iiiiiiiiiiih !,iiiiiiiiiiih
...,...,...,...,...,...
4692,hi here ! how are you ?,1,hi here ! how are you ?,hi ! ?,
4693,nice ! i'm fine too,1,nice ! i'm fine too,nice ! i'm fine,
4694,what means m ?,1,what means m ?,means ?,
4695,hi there ! ! how are you ? 😄,1,hi there ! ! how are you ? 😄,hi ! ! ? 😄,


In [257]:
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['label'], test_size=0.2, random_state=42)
X_train_lower, X_test_lower, y_train_lower, y_test_lower = train_test_split(data['lowercase'], data['label'], test_size=0.2, random_state=42)
X_train_stop, X_test_stop, y_train_stop, y_test_stop = train_test_split(data['stop_words'], data['label'], test_size=0.2, random_state=42)
X_train_filter, X_test_filter, y_train_filter, y_test_filter = train_test_split(data['filter'], data['label'], test_size=0.2, random_state=42)

In [258]:
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

X_train_lower_vectorized = vectorizer.fit_transform(X_train_lower)
X_test_lower_vectorized = vectorizer.transform(X_test_lower)

X_train_stop_vectorized = vectorizer.fit_transform(X_train_stop)
X_test_stop_vectorized = vectorizer.transform(X_test_stop)

X_train_filter_vectorized = vectorizer.fit_transform(X_train_filter)
X_test_filter_vectorized = vectorizer.transform(X_test_filter)

## NaiveBayes

In [259]:
NaiveBayes = MultinomialNB()

NaiveBayes.fit(X_train_vectorized, y_train)
y_pred = NaiveBayes.predict(X_test_vectorized)
accuracy = accuracy_score(y_test, y_pred)
print(f'{NaiveBayes} Accuracy original text: {accuracy:.4f}')

NaiveBayes.fit(X_train_lower_vectorized, y_train_lower)
y_pred = NaiveBayes.predict(X_test_lower_vectorized)
accuracy = accuracy_score(y_test_lower, y_pred)
print(f'{NaiveBayes} Accuracy lower text: {accuracy:.4f}')

NaiveBayes.fit(X_train_stop_vectorized, y_train_stop)
y_pred = NaiveBayes.predict(X_test_stop_vectorized)
accuracy = accuracy_score(y_test_stop, y_pred)
print(f'{NaiveBayes} Accuracy without stopwords: {accuracy:.4f}')

NaiveBayes.fit(X_train_filter_vectorized, y_train_filter)
y_pred = NaiveBayes.predict(X_test_filter_vectorized)
accuracy = accuracy_score(y_test_filter, y_pred)
print(f'{NaiveBayes} Accuracy without filter words: {accuracy:.4f}')

MultinomialNB() Accuracy original text: 0.7160
MultinomialNB() Accuracy lower text: 0.7160
MultinomialNB() Accuracy without stopwords: 0.6915
MultinomialNB() Accuracy without filter words: 0.6936


## SVM

In [260]:
SVM = SVC()

SVM.fit(X_train_vectorized, y_train)
y_pred = SVM.predict(X_test_vectorized)
accuracy = accuracy_score(y_test, y_pred)
print(f'{SVM} Accuracy original text: {accuracy:.4f}')

SVM.fit(X_train_lower_vectorized, y_train_lower)
y_pred = SVM.predict(X_test_lower_vectorized)
accuracy = accuracy_score(y_test_lower, y_pred)
print(f'{SVM} Accuracy lower text: {accuracy:.4f}')

SVM.fit(X_train_stop_vectorized, y_train_stop)
y_pred = SVM.predict(X_test_stop_vectorized)
accuracy = accuracy_score(y_test_stop, y_pred)
print(f'{SVM} Accuracy without stopwords: {accuracy:.4f}')

SVM.fit(X_train_filter_vectorized, y_train_filter)
y_pred = SVM.predict(X_test_filter_vectorized)
accuracy = accuracy_score(y_test_filter, y_pred)
print(f'{SVM} Accuracy without filter words: {accuracy:.4f}')

SVC() Accuracy original text: 0.7330
SVC() Accuracy lower text: 0.7330
SVC() Accuracy without stopwords: 0.6883
SVC() Accuracy without filter words: 0.6798


## kNN

In [261]:
kNN = KNeighborsClassifier()

kNN.fit(X_train_vectorized, y_train)
y_pred = kNN.predict(X_test_vectorized)
accuracy = accuracy_score(y_test, y_pred)
print(f'{kNN} Accuracy original text: {accuracy:.4f}')

kNN.fit(X_train_lower_vectorized, y_train_lower)
y_pred = kNN.predict(X_test_lower_vectorized)
accuracy = accuracy_score(y_test_lower, y_pred)
print(f'{kNN} Accuracy lower text: {accuracy:.4f}')

kNN.fit(X_train_stop_vectorized, y_train_stop)
y_pred = kNN.predict(X_test_stop_vectorized)
accuracy = accuracy_score(y_test_stop, y_pred)
print(f'{kNN} Accuracy without stopwords: {accuracy:.4f}')

kNN.fit(X_train_filter_vectorized, y_train_filter)
y_pred = kNN.predict(X_test_filter_vectorized)
accuracy = accuracy_score(y_test_filter, y_pred)
print(f'{kNN} Accuracy without filter words: {accuracy:.4f}')

KNeighborsClassifier() Accuracy original text: 0.6021
KNeighborsClassifier() Accuracy lower text: 0.6021
KNeighborsClassifier() Accuracy without stopwords: 0.5862
KNeighborsClassifier() Accuracy without filter words: 0.5809


## RandomForest

In [262]:
RandomForest = RandomForestClassifier()

RandomForest.fit(X_train_vectorized, y_train)
y_pred = RandomForest.predict(X_test_vectorized)
accuracy = accuracy_score(y_test, y_pred)
print(f'{RandomForest} Accuracy original text: {accuracy:.4f}')

feature_names = vectorizer.get_feature_names_out()
feature_importance = RandomForest.feature_importances_
important_features = pd.DataFrame(list(zip(feature_names, feature_importance)), columns=['Feature', 'Importance'])
print(f'Top 10 important features for {RandomForest}:\n{important_features.head(10)}')

RandomForest.fit(X_train_lower_vectorized, y_train_lower)
y_pred = RandomForest.predict(X_test_lower_vectorized)
accuracy = accuracy_score(y_test_lower, y_pred)
print(f'{RandomForest} Accuracy lower text: {accuracy:.4f}')

feature_names = vectorizer.get_feature_names_out()
feature_importance = RandomForest.feature_importances_
important_features = pd.DataFrame(list(zip(feature_names, feature_importance)), columns=['Feature', 'Importance'])
print(f'Top 10 important features for {RandomForest}:\n{important_features.head(10)}')

RandomForest.fit(X_train_stop_vectorized, y_train_stop)
y_pred = RandomForest.predict(X_test_stop_vectorized)
accuracy = accuracy_score(y_test_stop, y_pred)
print(f'{RandomForest} Accuracy without stopwords: {accuracy:.4f}')

feature_names = vectorizer.get_feature_names_out()
feature_importance = RandomForest.feature_importances_
important_features = pd.DataFrame(list(zip(feature_names, feature_importance)), columns=['Feature', 'Importance'])
print(f'Top 10 important features for {RandomForest}:\n{important_features.head(10)}')

RandomForest.fit(X_train_filter_vectorized, y_train_filter)
y_pred = RandomForest.predict(X_test_filter_vectorized)
accuracy = accuracy_score(y_test_filter, y_pred)
print(f'{RandomForest} Accuracy without filter words: {accuracy:.4f}')

feature_names = vectorizer.get_feature_names_out()
feature_importance = RandomForest.feature_importances_
important_features = pd.DataFrame(list(zip(feature_names, feature_importance)), columns=['Feature', 'Importance'])
print(f'Top 10 important features for {RandomForest}:\n{important_features.head(10)}')

RandomForestClassifier() Accuracy original text: 0.7309
Top 10 important features for RandomForestClassifier():
  Feature    Importance
0      00  4.119896e-05
1     000  2.932514e-07
2  00ffff  4.523483e-05
3      01  1.035812e-05
4      02  0.000000e+00
5      03  3.715093e-06
6      04  1.175418e-05
7      05  0.000000e+00
8      07  0.000000e+00
9      10  5.310937e-05
RandomForestClassifier() Accuracy lower text: 0.7309
Top 10 important features for RandomForestClassifier():
  Feature    Importance
0      00  1.250705e-04
1     000  6.616313e-07
2  00ffff  1.805533e-05
3      01  1.567021e-06
4      02  0.000000e+00
5      03  7.317620e-06
6      04  2.682095e-06
7      05  0.000000e+00
8      07  0.000000e+00
9      10  3.643545e-05
RandomForestClassifier() Accuracy without stopwords: 0.6670
Top 10 important features for RandomForestClassifier():
  Feature  Importance
0      00    0.000052
1     000    0.000004
2  00ffff    0.000100
3      01    0.000007
4      02    0.000000
5  

## LogisticRegression

In [263]:
LogisticRegression = LogisticRegression()

LogisticRegression.fit(X_train_vectorized, y_train)
y_pred = LogisticRegression.predict(X_test_vectorized)
accuracy = accuracy_score(y_test, y_pred)
print(f'{LogisticRegression} Accuracy original text: {accuracy:.4f}')

important_features = important_features.sort_values(by='Importance', ascending=False)
feature_importance = np.abs(LogisticRegression.coef_[0])
important_features = pd.DataFrame(list(zip(feature_names, feature_importance)), columns=['Feature', 'Importance'])
important_features = important_features.sort_values(by='Importance', ascending=False)
print(f'Top 10 important features for {LogisticRegression}:\n{important_features.head(10)}')

LogisticRegression.fit(X_train_lower_vectorized, y_train_lower)
y_pred = LogisticRegression.predict(X_test_lower_vectorized)
accuracy = accuracy_score(y_test_lower, y_pred)
print(f'{LogisticRegression} Accuracy lower text: {accuracy:.4f}')

important_features = important_features.sort_values(by='Importance', ascending=False)
feature_importance = np.abs(LogisticRegression.coef_[0])
important_features = pd.DataFrame(list(zip(feature_names, feature_importance)), columns=['Feature', 'Importance'])
important_features = important_features.sort_values(by='Importance', ascending=False)
print(f'Top 10 important features for {LogisticRegression}:\n{important_features.head(10)}')

LogisticRegression.fit(X_train_stop_vectorized, y_train_stop)
y_pred = LogisticRegression.predict(X_test_stop_vectorized)
accuracy = accuracy_score(y_test_stop, y_pred)
print(f'{LogisticRegression} Accuracy without stopwords: {accuracy:.4f}')

important_features = important_features.sort_values(by='Importance', ascending=False)
feature_importance = np.abs(LogisticRegression.coef_[0])
important_features = pd.DataFrame(list(zip(feature_names, feature_importance)), columns=['Feature', 'Importance'])
important_features = important_features.sort_values(by='Importance', ascending=False)
print(f'Top 10 important features for {LogisticRegression}:\n{important_features.head(10)}')

LogisticRegression.fit(X_train_filter_vectorized, y_train_filter)
y_pred = LogisticRegression.predict(X_test_filter_vectorized)
accuracy = accuracy_score(y_test_filter, y_pred)
print(f'{LogisticRegression} Accuracy without filter words: {accuracy:.4f}')

important_features = important_features.sort_values(by='Importance', ascending=False)
feature_importance = np.abs(LogisticRegression.coef_[0])
important_features = pd.DataFrame(list(zip(feature_names, feature_importance)), columns=['Feature', 'Importance'])
important_features = important_features.sort_values(by='Importance', ascending=False)
print(f'Top 10 important features for {LogisticRegression}:\n{important_features.head(10)}')


LogisticRegression() Accuracy original text: 0.7085
Top 10 important features for LogisticRegression():
             Feature  Importance
1974           mejor    3.516494
1184        extremas    3.230081
2715          seemed    3.155412
985          driving    2.616612
2088           named    2.348790
1408           grupo    2.253971
2650  rottentomatoes    2.200000
2101  necesariamente    2.195582
1874           llama    2.003890
1868        listened    1.895497
LogisticRegression() Accuracy lower text: 0.7085
Top 10 important features for LogisticRegression():
             Feature  Importance
1974           mejor    3.516494
1184        extremas    3.230081
2715          seemed    3.155412
985          driving    2.616612
2088           named    2.348790
1408           grupo    2.253971
2650  rottentomatoes    2.200000
2101  necesariamente    2.195582
1874           llama    2.003890
1868        listened    1.895497
LogisticRegression() Accuracy without stopwords: 0.6989
Top 10 import