Тема “Создание признакового пространства”

Продолжим обработку данных с Твиттера. 

1. Создайте мешок слов с помощью sklearn.feature_extraction.text.CountVectorizer.fit_transform(). Применим его к 'tweet_stemmed' и 'tweet_lemmatized' отдельно.
Игнорируем слова, частота которых в документе строго превышает порог 0.9 с помощью max_df.
Ограничим количество слов, попадающий в мешок, с помощью max_features = 1000.
Исключим стоп-слова с помощью stop_words='english'. 
Отобразим Bag-of-Words модель как DataFrame. columns необходимо извлечь с помощью CountVectorizer.get_feature_names().
 
2. Создайте мешок слов с помощью sklearn.feature_extraction.text.TfidfVectorizer.fit_transform(). Применим его к 'tweet_stemmed' и 'tweet_lemmatized' отдельно.
Игнорируем слова, частота которых в документе строго превышает порог 0.9 с помощью max_df.
Ограничим количество слов, попадающий в мешок, с помощью max_features = 1000.
Исключим стоп-слова с помощью stop_words='english'.
Отобразим Bag-of-Words модель как DataFrame. columns необходимо извлечь с помощью TfidfVectorizer.get_feature_names().
 
3. Проверьте ваши векторайзеры на корпусе который использовали на вебинаре, составьте таблицу метод векторизации и скор который вы получили (в методах векторизации по изменяйте параметры что бы добиться лучшего скора) обратите внимание как падает/растёт скор при уменьшении количества фичей, и изменении параметров, так же попробуйте применить к векторайзерам PCA для сокращения размерности посмотрите на качество сделайте выводы


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import pickle

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn import model_selection, preprocessing, linear_model
from sklearn.metrics import accuracy_score

In [2]:
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [3]:
with open('/gdrive/MyDrive/nero_final/combine_df.pickle', 'rb') as f:
    combine_df = pickle.load(f)

In [4]:
combine_df.head(2)

Unnamed: 0,id,label,tweet,tweet_token,tweet_token_filtered,tweet_stemmed,tweet_lemmatized
0,1,0.0,when father is dysfunctional and is so selfish...,"[when, father, is, dysfunctional, and, is, so,...","[father, dysfunct, selfish, drag, kid, dysfunc...","[father, dysfunct, selfish, drag, kid, dysfunc...","[father, dysfunct, selfish, drag, kid, dysfunc..."
1,2,0.0,thanks for lyft credit cannot use cause they d...,"[thanks, for, lyft, credit, can, not, use, cau...","[thank, lyft, credit, use, cau, offer, wheelch...","[thank, lyft, credit, use, cau, offer, wheelch...","[thank, lyft, credit, use, cau, offer, wheelch..."


In [5]:
class preprocessing_model():
    
    def __init__(self, df, cols_list, y_name):
        
        for col in cols_list:
            df[col] = df[col].apply(lambda x: ' '.join(x))

        for col in list(df.select_dtypes(include=['bool']).columns):
            if not col is cols_list and col != y_name:
                df = df.drop(col, 1)
            
        self.df = df
        self.cols_list = cols_list
        self._y_name = y_name 
        self._vectorizer_list = ['CountVectorizer', 'TfidfVectorizer']
        self.df_result = pd.DataFrame(columns=['column', 'vectorizer', 'max_df', 'max_features', 'accuracy_score'])
        
    def results_predictions(self):
        return self.df_result
    
    def BagOfWords(self, VectorizerName, max_df = 0.9, max_features = 1000, fit_transform = True):
            
        dict_results = dict()
        
        for col in self.cols_list:
            
            vectorizer = None
            
            if VectorizerName == 'CountVectorizer':
                vectorizer = CountVectorizer(max_df = max_df, max_features = max_features, stop_words='english')
            else:
                vectorizer = TfidfVectorizer(max_df = max_df, max_features = max_features, stop_words='english')
            
            if fit_transform:
                
                # Создаем the Bag-of-Words модель
                bag_of_words = vectorizer.fit_transform(self.df[col].tolist())

                # Отобразим Bag-of-Words модель как DataFrame
                feature_names = vectorizer.get_feature_names()
                dict_results[col] = pd.DataFrame(bag_of_words.toarray(), columns = feature_names)
                
            else:
                vectorizer.fit(self.df[col])
                dict_results[col] = vectorizer 
                
        return dict_results
        
    def predictions_result(self, max_df = 0.9, max_features = 1000):
    
        for vectorizer in self._vectorizer_list:
            
            dict_results = self.BagOfWords(vectorizer, max_df = max_df, max_features = max_features,fit_transform = False)
            
            for col in dict_results.keys():
            
                train_x, valid_x, train_y, valid_y = model_selection.train_test_split(self.df[col], self.df[self._y_name])

                train_y[np.isnan(train_y)] = 0
                valid_y[np.isnan(valid_y)] = 0

                xtrain_count =  dict_results[col].transform(train_x)
                xvalid_count =  dict_results[col].transform(valid_x)

                classifier = linear_model.LogisticRegression(random_state=0)
                classifier.fit(xtrain_count[:10000], train_y[:10000])
                predictions = classifier.predict(xvalid_count)

                self.df_result.loc[len(self.df_result)] = [col, vectorizer, max_df, max_features, accuracy_score(valid_y, predictions)]
                
    def creat_pull_predictions(self, max_df_list, max_features_list):
        
        for max_df in max_df_list:
            for max_features in max_features_list:
                self.predictions_result(max_df, max_features)
                

In [6]:
preprocessing_model = preprocessing_model(combine_df,['tweet_stemmed', 'tweet_lemmatized'], 'label')

In [7]:
dict_BagOfWords = preprocessing_model.BagOfWords('CountVectorizer')

In [8]:
dict_BagOfWords['tweet_stemmed'].head(2)

Unnamed: 0,abl,absolut,accept,account,act,action,actor,actual,ad,adapt,add,adventur,affirm,afternoon,age,ago,agr,ahead,aist,album,aliv,allahsoil,allow,alon,alreadi,altwaystoh,alway,amaz,america,american,amp,angel,anger,angri,anim,anniversari,announc,anoth,answer,anti,...,went,wet,whatev,white,wife,wild,win,wine,winner,wish,woh,woman,women,wonder,wor,word,work,workout,world,worri,worst,wow,write,wrong,xx,xxx,ya,yay,ye,yeah,year,yesterday,yo,yoga,york,young,youtub,yoyou,yr,yummi
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [9]:
dict_BagOfWords['tweet_lemmatized'].head(2)

Unnamed: 0,abl,absolut,accept,account,act,action,actor,actual,ad,adapt,add,adventur,affirm,afternoon,age,ago,agr,ahead,aist,album,aliv,allahsoil,allow,alon,alreadi,altwaystoh,alway,amaz,america,american,amp,angel,anger,angri,anim,anniversari,announc,anoth,answer,anti,...,went,wet,whatev,white,wife,wild,win,wine,winner,wish,woh,woman,women,wonder,wor,word,work,workout,world,worri,worst,wow,write,wrong,xx,xxx,ya,yay,ye,yeah,year,yesterday,yo,yoga,york,young,youtub,yoyou,yr,yummi
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [10]:
dict_BagOfWords= preprocessing_model.BagOfWords('TfidfVectorizer')

In [11]:
dict_BagOfWords['tweet_stemmed'].head(2)

Unnamed: 0,abl,absolut,accept,account,act,action,actor,actual,ad,adapt,add,adventur,affirm,afternoon,age,ago,agr,ahead,aist,album,aliv,allahsoil,allow,alon,alreadi,altwaystoh,alway,amaz,america,american,amp,angel,anger,angri,anim,anniversari,announc,anoth,answer,anti,...,went,wet,whatev,white,wife,wild,win,wine,winner,wish,woh,woman,women,wonder,wor,word,work,workout,world,worri,worst,wow,write,wrong,xx,xxx,ya,yay,ye,yeah,year,yesterday,yo,yoga,york,young,youtub,yoyou,yr,yummi
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
dict_BagOfWords['tweet_lemmatized'].head(2)

Unnamed: 0,abl,absolut,accept,account,act,action,actor,actual,ad,adapt,add,adventur,affirm,afternoon,age,ago,agr,ahead,aist,album,aliv,allahsoil,allow,alon,alreadi,altwaystoh,alway,amaz,america,american,amp,angel,anger,angri,anim,anniversari,announc,anoth,answer,anti,...,went,wet,whatev,white,wife,wild,win,wine,winner,wish,woh,woman,women,wonder,wor,word,work,workout,world,worri,worst,wow,write,wrong,xx,xxx,ya,yay,ye,yeah,year,yesterday,yo,yoga,york,young,youtub,yoyou,yr,yummi
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
preprocessing_model.creat_pull_predictions([0.9,0.8,0.5],[200,500,1000])

In [14]:
preprocessing_model.results_predictions()

Unnamed: 0,column,vectorizer,max_df,max_features,accuracy_score
0,tweet_stemmed,CountVectorizer,0.9,200,0.955899
1,tweet_lemmatized,CountVectorizer,0.9,200,0.954434
2,tweet_stemmed,TfidfVectorizer,0.9,200,0.9524
3,tweet_lemmatized,TfidfVectorizer,0.9,200,0.953702
4,tweet_stemmed,CountVectorizer,0.9,500,0.958015
5,tweet_lemmatized,CountVectorizer,0.9,500,0.95598
6,tweet_stemmed,TfidfVectorizer,0.9,500,0.954028
7,tweet_lemmatized,TfidfVectorizer,0.9,500,0.956306
8,tweet_stemmed,CountVectorizer,0.9,1000,0.958584
9,tweet_lemmatized,CountVectorizer,0.9,1000,0.956143
