In [1]:
import pandas as pd
from scipy import stats as st

import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split
import numpy as np

## Задание 1

In [2]:
df = pd.read_csv('vgsales.csv')
df.head()

Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating
0,Wii Sports,Wii,2006.0,Sports,Nintendo,41.36,28.96,3.77,8.45,82.53,76.0,51.0,8.0,322.0,Nintendo,E
1,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24,,,,,,
2,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.68,12.76,3.79,3.29,35.52,82.0,73.0,8.3,709.0,Nintendo,E
3,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.61,10.93,3.28,2.95,32.77,80.0,73.0,8.0,192.0,Nintendo,E
4,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37,,,,,,


In [3]:
df_ = df[df['Critic_Score'].notna()] #из-за значений NaN в столбце, pvalue становился равным nan
                                    #nan_policy для одностороннего теста не работает 

За alpha здесь и далее будем принимать 0.05

### 1) Как критики относятся к спортивным играм?

H0: средняя удовлетворенность критиков спортивными играми <= 70 баллов  <br> 
H1: средняя удовлетворенность критиков спортивными играми составляет более 70 баллов <br> 

Взял порог в 70 баллов, потому что сложилось, что ниже ~70 игра считается не очень хорошей

In [4]:
df_sport = df_[(df_.Genre == 'Sports')]

In [5]:
alpha = 0.05
result = st.ttest_1samp(df_sport['Critic_Score'], 70, alternative='greater')

print(df_sport['Critic_Score'].mean())
print(result)

if result.pvalue < alpha: 
    print('Отвергаем нулевую гипотезу, среднее больше 70 (жанр спортивных игр нравится)')
else:
    print('Не отвергаем нулевую нулевую гипотезу, жанр спортивных игр не нравится критикам')

71.96817420435511
Ttest_1sampResult(statistic=4.900329475499578, pvalue=5.442744598708131e-07)
Отвергаем нулевую гипотезу, среднее больше 70 (жанр спортивных игр нравится)


### 2) Критикам нравятся больше игры на PC или на PS4?

H0: средняя оценка критиков PC и PS4 одинаковая  
H1: средняя оценка критиков PC и PS4 различается

In [6]:
df_pc = df_[(df_.Platform == 'PC')]
df_ps4 = df_[(df_.Platform == 'PS4')]

In [7]:
result = st.ttest_ind(df_pc['Critic_Score'], df_ps4['Critic_Score'], equal_var=False)
print(result)

if (result.pvalue < alpha):
    print('Отвергаем нулевую гипотезу, критики по-разному оценивают игры на PC и PS4')
else:
    print('Не отвергаем нулевую гипотезу')

Ttest_indResult(statistic=4.3087588262138725, pvalue=2.067249157283479e-05)
Отвергаем нулевую гипотезу, критики по-разному оценивают игры на PC и PS4


In [8]:
df_ps4['Critic_Score'].mean()

72.09126984126983

In [9]:
df_pc['Critic_Score'].mean()

75.92867132867133

В среднем критики оценивают игры по-разному между платформами (статистически это потверждается).
Можем сравнить среднюю оценку по платформам и определить на какой более высоко оценивают игры. Это PC

75.9(PC) > 72.1(PS4)

### 3) Критикам больше нравятся стрелялки или стратегии?

H0: средняя оценка критиков шутеров и стратегий одинаковая  
H1: средняя оценка критиков шутеров и стратегий различается

In [10]:
df_shooter = df_[(df_.Genre == 'Shooter')]
df_strategy = df_[(df_.Genre == 'Strategy')]

In [11]:
result = st.ttest_ind(df_shooter['Critic_Score'], df_strategy['Critic_Score'], equal_var=False)
print(result)

if (result.pvalue < alpha):
    print('Отвергаем нулевую гипотезу, критики по разному оценивают жанры')
else:
    print('Не отвергаем нулевую гипотезу')

Ttest_indResult(statistic=-2.2972408230640315, pvalue=0.021938989522304823)
Отвергаем нулевую гипотезу, критики по разному оценивают жанры


In [12]:
df_shooter['Critic_Score'].mean()

70.18114406779661

In [13]:
df_strategy['Critic_Score'].mean()

72.08609271523179

В среднем критики оценивают игры по-разному между платформами (статистически это потверждается).
Можем сравнить среднюю оценку по жанрам и определить более привлекательный для критиков. Это стратегии

70.2(шутеры) > 72.1(стратегии)

## Задание 2

In [14]:
df_spam = pd.read_csv('spam.csv')
df_spam.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### 1) Приведите весь текст к нижнему регистру

In [15]:
df_spam['Message'] = df_spam['Message'].str.lower()
df_spam.head()

Unnamed: 0,Category,Message
0,ham,"go until jurong point, crazy.. available only ..."
1,ham,ok lar... joking wif u oni...
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor... u c already then say...
4,ham,"nah i don't think he goes to usf, he lives aro..."


### 2) Удалите мусорные символы;

In [16]:
df_spam['only_words'] = df_spam['Message'].map(lambda x: re.sub('[\W_]+',' ', x))
df_spam.head()

Unnamed: 0,Category,Message,only_words
0,ham,"go until jurong point, crazy.. available only ...",go until jurong point crazy available only in ...
1,ham,ok lar... joking wif u oni...,ok lar joking wif u oni
2,spam,free entry in 2 a wkly comp to win fa cup fina...,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor... u c already then say...,u dun say so early hor u c already then say
4,ham,"nah i don't think he goes to usf, he lives aro...",nah i don t think he goes to usf he lives arou...


### 3) Удалите стоп-слова;

In [17]:
df_spam['only_words'] = df_spam['only_words'].map(lambda x: x.split())

stopwords_set = set(stopwords.words('english'))
df_spam['without_sw'] = df_spam['only_words'].map(lambda x: [word for word in x if word not in stopwords_set] )
df_spam.head()

Unnamed: 0,Category,Message,only_words,without_sw
0,ham,"go until jurong point, crazy.. available only ...","[go, until, jurong, point, crazy, available, o...","[go, jurong, point, crazy, available, bugis, n..."
1,ham,ok lar... joking wif u oni...,"[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]"
2,spam,free entry in 2 a wkly comp to win fa cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f...","[free, entry, 2, wkly, comp, win, fa, cup, fin..."
3,ham,u dun say so early hor... u c already then say...,"[u, dun, say, so, early, hor, u, c, already, t...","[u, dun, say, early, hor, u, c, already, say]"
4,ham,"nah i don't think he goes to usf, he lives aro...","[nah, i, don, t, think, he, goes, to, usf, he,...","[nah, think, goes, usf, lives, around, though]"


### 4) Приведите все слова к нормальной форме;

In [18]:
wordnet_lemmatizer = WordNetLemmatizer()
df_spam['lemmatized'] = df_spam['without_sw'].map(lambda x: [wordnet_lemmatizer.lemmatize(word) for word in x] )

In [19]:
def to_str(lem):
    return ' '.join([x for x in lem])

In [20]:
df_spam['new_message'] = df_spam['lemmatized'].apply(to_str)
df_spam.head()

Unnamed: 0,Category,Message,only_words,without_sw,lemmatized,new_message
0,ham,"go until jurong point, crazy.. available only ...","[go, until, jurong, point, crazy, available, o...","[go, jurong, point, crazy, available, bugis, n...","[go, jurong, point, crazy, available, bugis, n...",go jurong point crazy available bugis n great ...
1,ham,ok lar... joking wif u oni...,"[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]",ok lar joking wif u oni
2,spam,free entry in 2 a wkly comp to win fa cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f...","[free, entry, 2, wkly, comp, win, fa, cup, fin...","[free, entry, 2, wkly, comp, win, fa, cup, fin...",free entry 2 wkly comp win fa cup final tkts 2...
3,ham,u dun say so early hor... u c already then say...,"[u, dun, say, so, early, hor, u, c, already, t...","[u, dun, say, early, hor, u, c, already, say]","[u, dun, say, early, hor, u, c, already, say]",u dun say early hor u c already say
4,ham,"nah i don't think he goes to usf, he lives aro...","[nah, i, don, t, think, he, goes, to, usf, he,...","[nah, think, goes, usf, lives, around, though]","[nah, think, go, usf, life, around, though]",nah think go usf life around though


### 5) Преобразуйте все сообщения в вектора TF-IDF;

In [21]:
df_redacted = df_spam.drop(columns=['Message', 'only_words', 'without_sw', 'lemmatized'])

In [22]:
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df_redacted['new_message'])

names = tfidf.get_feature_names()
matrix = pd.DataFrame(tfidf_matrix.toarray(), columns=names)
matrix.head()

Unnamed: 0,00,000,000pes,008704050406,0089,0121,01223585236,01223585334,0125698789,02,...,zhong,zindgi,zoe,zogtorius,zoom,zouk,zyada,èn,ú1,〨ud
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 6) Разделите данные на тестовые и тренировочные в соотношении 30/70, укажите random_state=42;

In [23]:
df_redacted['class_message'] = (df_redacted['Category'] == 'spam')* 1
df_redacted.head()

Unnamed: 0,Category,new_message,class_message
0,ham,go jurong point crazy available bugis n great ...,0
1,ham,ok lar joking wif u oni,0
2,spam,free entry 2 wkly comp win fa cup final tkts 2...,1
3,ham,u dun say early hor u c already say,0
4,ham,nah think go usf life around though,0


In [24]:
X_train, X_test, y_train, y_test = train_test_split(matrix, df_redacted['class_message'], test_size=0.30, random_state=42)

### 7) Постройте модель логистической регрессии, укажите random_state=42, оцените ее точность на тестовых данных;

In [25]:
lda = LinearDiscriminantAnalysis()

In [26]:
lda.fit(X_train, y_train)

LinearDiscriminantAnalysis()

In [27]:
accuracy_score(y_test, lda.predict(X_test))

0.9677033492822966

Точность порядка 97%

### 8) Опишите результаты при помощи confusion_matrix;

In [28]:
confusion_matrix(y_test, lda.predict(X_test))

array([[1445,    3],
       [  51,  173]], dtype=int64)

Предсказания: <br>
 верные: spam - 1445, ham - 173<br>
 неверные: 54

### 9) Постройте датафрейм, который будет содержать все исходные тексты сообщений, классифицированные неправильно (с указанием фактического и предсказанного).

In [32]:
df_wrong = pd.concat([df_redacted[df_redacted.index.isin(y_test.index)].reset_index(), 
                    pd.Series(lda.predict(X_test), np.arange(len(lda.predict(X_test))),
                    name = 'predict')], axis=1)

df_wrong = df_wrong[df_wrong['class_message'] != df_wrong['predict']]
df_wrong

Unnamed: 0,index,Category,new_message,class_message,predict
0,8,spam,winner valued network customer selected receiv...,1,0
1,12,spam,urgent 1 week free membership 100 000 prize ja...,1,0
2,15,spam,xxxmobilemovieclub use credit click wap link n...,1,0
4,19,spam,england v macedonia dont miss goal team news t...,1,0
14,47,ham,fair enough anything going,0,1
...,...,...,...,...,...
1633,5446,ham,back good journey let know need receipt shall ...,0,1
1634,5450,ham,sac need carry,0,1
1638,5457,ham,arun u transfr amt,0,1
1657,5524,spam,awarded sipix digital camera call 09061221061 ...,1,0
