In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

from feature_engine.imputation import CategoricalImputer, MeanMedianImputer
from feature_engine.encoding import OrdinalEncoder, OneHotEncoder
from feature_engine.wrappers import SklearnTransformerWrapper
from sklearn.preprocessing import StandardScaler

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.preprocessing import FunctionTransformer

from sklearn import set_config
set_config(display='diagram')
# ------------------------------------------
import nltk
import sys
import random

from sklearn.feature_extraction.text import CountVectorizer

sys.path.append('..')
import utils
from sklearn.preprocessing import FunctionTransformer

## PROPIAS
from utils import RemoveStopWords
from utils import FeatureExtractionTwitts
from utils import LemmantizerTransformer




https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.FunctionTransformer.html

In [None]:
df = pd.read_csv('training_tweets.csv', index_col = 0)
df;

In [None]:
groups_classes = {
                    'happiness' : 'positiva',
                    'surprise'  : 'positiva',
                    'love'      : 'positiva',
                    'fun'       : 'positiva',
                    'relief'    : 'positiva',
                    'enthusiasm': 'positiva',
                    'worry'     : 'negativa',
                    'hate'      : 'negativa',
                    'sadness'   : 'negativa',
                    'empty'     : 'negativa',
                    'boredom'   : 'negativa',
                    'anger'     : 'negativa',
                    'neutral'   : 'neutral'
                }
target_var_name = 'sentiment'
neutral_class_name = 'neutral'

new_ordered_columns = df.columns[df.columns.str.contains('content')].to_list() + df.columns[df.columns.str.contains('var')].to_list() + df.columns[df.columns.str.contains('sentiment')].to_list()


In [None]:
df_preprocess = Pipeline(steps=[
                ('regroup_classes', 
                    FunctionTransformer(
                        utils.multi_class_remapping, 
                        kw_args={
                            'group_classes': groups_classes,
                            'var_name': target_var_name,
                            'neutral_class': neutral_class_name,
                            'random_state': 42
                            }))
])

df = df_preprocess.transform(df) # este preprocess del df es solo para remapear las clases del target. Preguntar a Alfonso como incluir esta etapa en el pipeline completo.

preprocessing = Pipeline(steps=[
                ('lt', LemmantizerTransformer(text_columns = ['content'], stemmers=['ps'])),
                ('rsw', RemoveStopWords(text_columns = ['content_ps'])),
                ('reorder', FunctionTransformer(utils.columns_reorder, kw_args={'new_columns_ordered': new_ordered_columns})),
                # 
])

feature_extraction = Pipeline(steps=[
                ('fet', FeatureExtractionTwitts(
                    text_column="content_ps_min",
                    features_to_extract = ["arrobas_count", "hashtag_count", "is_reply"]
                    ))
])


In [None]:
pipe = Pipeline(steps=[
    ('preprocessing', preprocessing),
    ('feature_extraction', feature_extraction),
])

X = df[['content']]
y = df.sentiment_remapped
X_tr = pipe.fit_transform(X,y)

In [98]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer


cvec = TfidfVectorizer()
text_preprocessing = Pipeline([('Vect', cvec)])
preproc = ColumnTransformer([('text_preprocessing', text_preprocessing, 'content_ps_min')])
Sa = preproc.fit_transform(X_tr)


ss = pd.DataFrame(Sa.todense().astype('int8'))
ss
# ss[34662].value_counts()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,34662,34663,34664,34665,34666,34667,34668,34669,34670,34671
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [75]:
tf = TfidfVectorizer()
string = 'Ah! Now I have done Philosophy,\nI have finished Law and Medicine,\nAnd sadly even Theology:\nTaken fierce pains, from end to end.\nNow here I am, a fool for sure!\nNo wiser than I was before:'
pd.DataFrame(string.split('\n'))


Unnamed: 0,0
0,"Ah! Now I have done Philosophy,"
1,"I have finished Law and Medicine,"
2,And sadly even Theology:
3,"Taken fierce pains, from end to end."
4,"Now here I am, a fool for sure!"
5,No wiser than I was before:


In [None]:

pipece = ColumnTransformer([('tf', tf, ['h'])])
pipece.fit_transform(pd.DataFrame(string.split('\n'), columns = ['h']))


In [49]:
# for i in [0,1,2]:
#     display(cvec.fit_transform(T[i]).shape)

(30000, 179716)

(30000, 169346)

(30000, 155638)