# Тема “Создание признакового пространства”

In [1]:
import pandas as pd
import numpy as np

from sklearn import model_selection, preprocessing, linear_model
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings("ignore")

Продолжим обработку данных с Твиттера. 

1. Создайте мешок слов с помощью sklearn.feature_extraction.text.CountVectorizer.fit_transform(). Применим его к 'tweet_stemmed' и 'tweet_lemmatized' отдельно.  
    • Игнорируем слова, которые встречаются более чем в 90% документов (порог 0.9), с помощью max_df.  
    • Ограничим количество слов, попадающих в мешок, с помощью max_features = 1000.  
    • Исключим стоп-слова с помощью stop_words='english'.  
    • Отобразим Bag-of-Words модель как DataFrame. columns необходимо извлечь с помощью   
CountVectorizer.get_feature_names().

In [2]:
# Load the tweets DataFrame from the pickle file and preview its first rows.
# NOTE: unpickling executes arbitrary code, so only load files you trust.
combine_df = pd.read_pickle('data/clean_tweets.pkl')
combine_df.head(5)

Unnamed: 0,id,label,tweet,tweet_new,tweet_token,tweet_token_filtered,tweet_stemmed,tweet_lemmatized
0,1,0.0,@user when a father is dysfunctional and is s...,when father is dysfunctional and is so selfish...,"[when, father, is, dysfunctional, and, is, so,...","[father, dysfunctional, selfish, drags, kids, ...","[father, dysfunct, selfish, drag, kid, dysfunc...","[father, dysfunctional, selfish, drag, kid, dy..."
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit cannot use cause they d...,"[thanks, for, lyft, credit, can, not, use, cau...","[thanks, lyft, credit, use, cause, offer, whee...","[thank, lyft, credit, use, caus, offer, wheelc...","[thanks, lyft, credit, use, cause, offer, whee..."
2,3,0.0,bihday your majesty,bihday your majesty,"[bihday, your, majesty]","[bihday, majesty]","[bihday, majesti]","[bihday, majesty]"
3,4,0.0,#model i love u take with u all the time in ...,model love you take with you all the time in ur,"[model, love, you, take, with, you, all, the, ...","[model, love, take, time, ur]","[model, love, take, time, ur]","[model, love, take, time, ur]"
4,5,0.0,factsguide: society now #motivation,factsguide society now motivation,"[factsguide, society, now, motivation]","[factsguide, society, motivation]","[factsguid, societi, motiv]","[factsguide, society, motivation]"


In [3]:
# Turn the token lists into space-separated strings for the vectorizers.
# Values that are already strings are left untouched, so re-running this cell
# is a no-op instead of splitting every tweet into single characters.
for col in ('tweet_stemmed', 'tweet_lemmatized'):
    combine_df[col] = combine_df[col].apply(
        lambda tokens: tokens if isinstance(tokens, str) else ' '.join(tokens)
    )

# Vectorizer settings shared by the CountVectorizer / TfidfVectorizer helpers.
param = {"ngram_range": (1, 1), 
         "binary": True, 
         "max_df": 0.9,
         "max_features": 1000}

In [4]:
def count_vectorizer(data, param):
    """Fit a CountVectorizer on `data` and return the result as a DataFrame.

    Parameters
    ----------
    data : iterable of str
        Documents to vectorize; tokens are produced with ``str.split``.
    param : dict
        Extra keyword arguments forwarded to ``CountVectorizer``
        (e.g. ``ngram_range``, ``binary``, ``max_df``, ``max_features``).

    Returns
    -------
    pandas.DataFrame
        Dense document-term matrix whose columns are the vocabulary terms.
    """
    model = CountVectorizer(analyzer='word', 
                            tokenizer=str.split,
                            stop_words='english',
                            **param)

    # Build the Bag-of-Words model
    BoW = model.fit_transform(data)

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    if hasattr(model, 'get_feature_names_out'):
        feature_names = model.get_feature_names_out()
    else:
        feature_names = model.get_feature_names()

    # Present the Bag-of-Words model as a DataFrame
    return pd.DataFrame(BoW.toarray(), columns=feature_names)

In [5]:
# Bag-of-words over the stemmed tweets; show the first three rows.
countDF_stem = count_vectorizer(data=combine_df['tweet_stemmed'], param=param)
countDF_stem.head(n=3)

Unnamed: 0,abl,absolut,accept,account,act,action,activ,actor,actual,ad,...,yeah,year,yesterday,yo,yoga,york,young,youtub,yr,yummi
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Bag-of-words over the lemmatized tweets; show the first three rows.
countDF_lem = count_vectorizer(data=combine_df['tweet_lemmatized'], param=param)
countDF_lem.head(n=3)

Unnamed: 0,able,absolutely,account,act,action,actor,actually,adapt,add,adventure,...,year,yes,yesterday,yo,yoga,york,young,youtube,yr,yummy
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


2. Создайте мешок слов с помощью sklearn.feature_extraction.text.TfidfVectorizer.fit_transform(). Применим его к 'tweet_stemmed' и 'tweet_lemmatized' отдельно.  
    • Игнорируем слова, которые встречаются более чем в 90% документов (порог 0.9), с помощью max_df.  
    • Ограничим количество слов, попадающих в мешок, с помощью max_features = 1000.  
    • Исключим стоп-слова с помощью stop_words='english'.  
    • Отобразим Bag-of-Words модель как DataFrame. columns необходимо извлечь с помощью  
TfidfVectorizer.get_feature_names().

In [7]:
def tfidf_vectorizer(data, param):
    """Fit a TfidfVectorizer on `data` and return the result as a DataFrame.

    Parameters
    ----------
    data : iterable of str
        Documents to vectorize; tokens are produced with ``str.split``.
    param : dict
        Extra keyword arguments forwarded to ``TfidfVectorizer``
        (e.g. ``ngram_range``, ``binary``, ``max_df``, ``max_features``).

    Returns
    -------
    pandas.DataFrame
        Dense TF-IDF matrix whose columns are the vocabulary terms.
    """
    model = TfidfVectorizer(analyzer='word', 
                            tokenizer=str.split,
                            stop_words='english',
                            **param)

    # Build the TF-IDF weighted Bag-of-Words model
    BoW = model.fit_transform(data)

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    if hasattr(model, 'get_feature_names_out'):
        feature_names = model.get_feature_names_out()
    else:
        feature_names = model.get_feature_names()

    # Present the model as a DataFrame
    return pd.DataFrame(BoW.toarray(), columns=feature_names)

In [8]:
# TF-IDF features over the stemmed tweets; show the first three rows.
tfidfDF_stem = tfidf_vectorizer(data=combine_df['tweet_stemmed'], param=param)
tfidfDF_stem.head(n=3)

Unnamed: 0,abl,absolut,accept,account,act,action,activ,actor,actual,ad,...,yeah,year,yesterday,yo,yoga,york,young,youtub,yr,yummi
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# TF-IDF features over the lemmatized tweets. Stored under its own name so the
# stemmed matrix in `tfidfDF_stem` is not silently overwritten.
tfidfDF_lem = tfidf_vectorizer(combine_df['tweet_lemmatized'], param)
tfidfDF_lem.head(3)

Unnamed: 0,able,absolutely,account,act,action,actor,actually,adapt,add,adventure,...,year,yes,yesterday,yo,yoga,york,young,youtube,yr,yummy
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


3. Проверьте ваши векторайзеры на корпусе, который использовали на вебинаре. Составьте таблицу «метод векторизации — полученный скор» (в методах векторизации поизменяйте параметры, чтобы добиться лучшего скора). Обратите внимание, как падает/растёт скор при уменьшении количества фичей и изменении параметров. Также попробуйте применить к векторайзерам PCA для сокращения размерности, посмотрите на качество и сделайте выводы.

In [10]:
# Load the data: every non-empty line is "<label> <text ...>".
# The context manager closes the file; the encoding is explicit so the result
# does not depend on the platform's default locale.
with open('data/02_corpus', encoding='utf-8') as corpus_file:
    data = corpus_file.read()

labels, texts = [], []
for line in data.split("\n"):
    content = line.split()
    if not content:
        # Blank lines (e.g. the one left by a trailing newline) carry no label.
        continue
    labels.append(content[0])
    texts.append(" ".join(content[1:]))

# Build the DataFrame
corpusDF = pd.DataFrame({'text': texts, 'label': labels})
corpusDF.head(3)

Unnamed: 0,text,label
0,Stuning even for the non-gamer: This sound tra...,__label__2
1,The best soundtrack ever to anything.: I'm rea...,__label__2
2,Amazing!: This soundtrack is my favorite music...,__label__2


In [17]:
# Parameter sets to compare; each one changes a single setting of "param_base".
dict_param = {"param_base": 
              {"ngram_range": (1, 1), "binary": True, "max_df": 0.9, "max_features": 1000},
              "param_ngram_range": 
              {"ngram_range": (1, 2), "binary": True, "max_df": 0.9, "max_features": 1000},
             "param_binary":
              {"ngram_range": (1, 1), "binary": False, "max_df": 0.9, "max_features": 1000},
             # A float max_df is a proportion of documents and must lie in
             # [0.0, 1.0]; the previous 1.5 is rejected by current scikit-learn
             # and never filtered anything on older releases.
             "param_max_df":
              {"ngram_range": (1, 1), "binary": True, "max_df": 0.5, "max_features": 1000},
             "param_max_features": 
              {"ngram_range": (1, 1), "binary": True, "max_df": 0.9, "max_features": 2000}}

# Accuracy table: one row per parameter set, one column per vectorizer.
stata = pd.DataFrame(columns=['count_vectorizer_accuracy', 'tfidf_vectorizer_accuracy'])

In [18]:
def log_reg(X, y):
    """Return the hold-out accuracy of a default LogisticRegression.

    `X` and `y` are split with ``train_test_split(random_state=1)``; the model
    is fitted on the training part and scored on the validation part.
    """
    split = model_selection.train_test_split(X, y, random_state=1)
    X_train, X_valid, y_train, y_valid = split

    model = linear_model.LogisticRegression()
    model.fit(X_train, y_train)

    return accuracy_score(y_valid, model.predict(X_valid))

In [19]:
# Score every parameter set with both vectorizers and record the accuracies.
for param_name, vec_params in dict_param.items():

    countDF_corpus = count_vectorizer(corpusDF['text'], vec_params)
    count_accuracy = log_reg(countDF_corpus, corpusDF['label'])
    stata.loc[param_name, 'count_vectorizer_accuracy'] = count_accuracy

    tfidfDF_corpus = tfidf_vectorizer(corpusDF['text'], vec_params)
    tfidf_accuracy = log_reg(tfidfDF_corpus, corpusDF['label'])
    stata.loc[param_name, 'tfidf_vectorizer_accuracy'] = tfidf_accuracy

stata

Unnamed: 0,count_vectorizer_accuracy,tfidf_vectorizer_accuracy
param_base,0.8132,0.8236
param_ngram_range,0.812,0.8192
param_binary,0.804,0.8136
param_max_df,0.8132,0.8236
param_max_features,0.8156,0.83


In [24]:
def reduce_dims(df, dims=500):
    """Project `df` onto `dims` PCA components.

    Returns a DataFrame whose columns are named "1" .. str(dims).
    """
    pca = PCA(n_components=dims, random_state=42)
    component_labels = list(map(str, range(1, dims + 1)))

    return pd.DataFrame(data=pca.fit_transform(df), columns=component_labels)

In [25]:
# Vectorize the corpus with `param`, reduce each matrix with PCA and
# record the resulting accuracies in the 'pca' row of the table.
countDF_corpus = count_vectorizer(corpusDF['text'], param)
tfidfDF_corpus = tfidf_vectorizer(corpusDF['text'], param)

for column, features in (('count_vectorizer_accuracy', countDF_corpus),
                         ('tfidf_vectorizer_accuracy', tfidfDF_corpus)):
    pcaDF = reduce_dims(features)
    stata.loc['pca', column] = log_reg(pcaDF, corpusDF['label'])

stata

Unnamed: 0,count_vectorizer_accuracy,tfidf_vectorizer_accuracy
param_base,0.8132,0.8236
param_ngram_range,0.812,0.8192
param_binary,0.804,0.8136
param_max_df,0.8132,0.8236
param_max_features,0.8156,0.83
pca,0.8032,0.8176
