In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

from feature_engine.imputation import CategoricalImputer, MeanMedianImputer
from feature_engine.encoding import OrdinalEncoder, OneHotEncoder
from feature_engine.wrappers import SklearnTransformerWrapper
from sklearn.preprocessing import StandardScaler

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.preprocessing import FunctionTransformer

from sklearn import set_config
set_config(display='diagram')

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.FunctionTransformer.html

In [2]:
df = pd.read_csv('training_tweets.csv', index_col = 0)
df

Unnamed: 0,content,sentiment
34723,Happy Mama's day to all mothers,love
17493,@LysdelTellez I am lost. Please help me find a...,worry
20198,"@BoomKatt yes yes I AM, networking whore to th...",happiness
6855,you@snapplynn Wish that would have been your t...,neutral
5924,now i am doing the MicroEconomics project iha...,worry
...,...,...
16469,I do not want to work tomorrow!,sadness
36006,@KandyBee we shuld do a dance like that its s...,fun
22647,"Photo: Got my prints a few days ago, ready for...",happiness
21478,@tove_liden Thanks for the follow Tove!,fun


In [3]:
import nltk
nltk.download('stopwords', download_dir='.')

[nltk_data] Downloading package stopwords to ....
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
pd.DataFrame(df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30000 entries, 34723 to 39364
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   content    30000 non-null  object
 1   sentiment  30000 non-null  object
dtypes: object(2)
memory usage: 703.1+ KB


In [5]:
import nltk
nltk.download('words')
from nltk.corpus import words
len(words.words())



[nltk_data] Downloading package words to
[nltk_data]     C:\Users\tomas\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


236736

In [6]:
import nltk
nltk.download('words')
from nltk.corpus import stopwords
len(stopwords.words('english'))


[nltk_data] Downloading package words to
[nltk_data]     C:\Users\tomas\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


179

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer()

X = vec.fit_transform(df)
vec.get_feature_names_out()
X

<2x2 sparse matrix of type '<class 'numpy.int64'>'
	with 2 stored elements in Compressed Sparse Row format>

In [8]:
df

Unnamed: 0,content,sentiment
34723,Happy Mama's day to all mothers,love
17493,@LysdelTellez I am lost. Please help me find a...,worry
20198,"@BoomKatt yes yes I AM, networking whore to th...",happiness
6855,you@snapplynn Wish that would have been your t...,neutral
5924,now i am doing the MicroEconomics project iha...,worry
...,...,...
16469,I do not want to work tomorrow!,sadness
36006,@KandyBee we shuld do a dance like that its s...,fun
22647,"Photo: Got my prints a few days ago, ready for...",happiness
21478,@tove_liden Thanks for the follow Tove!,fun


In [9]:
try:
    nltk.data.find('./corpora/stopwords')
    print('pase')
except LookupError:
    nltk.download('words',download_dir='.')
    print('descargue')
stopwords.words('english');


pase


In [10]:
df.content.name

'content'

In [11]:
import sys

sys.path.append('..')
from utils import RemoveStopWords

rsw = RemoveStopWords(columns = ['content'])
df = rsw.transform(df)

In [12]:
from utils import FeatureExtractionTwitts

fet = FeatureExtractionTwitts(twit_text_column="content_min", features_to_extract = ["arrobas_count", "hashtag_count", "is_reply"])
df = fet.transform(df)
df.arrobas_count.value_counts()

0     15681
1     13635
2       558
3        76
4        28
5        13
6         5
9         1
10        1
7         1
8         1
Name: arrobas_count, dtype: int64

In [13]:
df.hashtag_count.value_counts()

0    29379
1      538
2       63
3       14
9        3
5        1
7        1
4        1
Name: hashtag_count, dtype: int64

In [14]:
df.is_reply.value_counts()

0    16512
1    13488
Name: is_reply, dtype: int64

In [35]:
from nltk.tokenize import sent_tokenize, word_tokenize
def stemSentence(sentence, stemmer):
    token_words=word_tokenize(sentence)
    token_words
    stem_sentence=[]
    for word in token_words:
        stem_sentence.append(stemmer(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)


import nltk
try:
    nltk.data.find('corpora/wordnet.zip')
    nltk.data.find('corpora/omw-1.4.zip/omw-1.4/')
except LookupError:
    nltk.download('wordnet')
    nltk.download('omw-1.4')
lemma = nltk.wordnet.WordNetLemmatizer()
sno = nltk.stem.SnowballStemmer('english')
ps = nltk.stem.PorterStemmer()
fd = pd.DataFrame([], columns=['initial', 'lemma', 'ps', 'sno'])
display(fd)
c=0
for sentence in df.content_min:
    sno_word = stemSentence(sentence, sno.stem)
    ps_word = stemSentence(sentence, ps.stem)
    lemma_word = stemSentence(sentence, lemma.lemmatize)
    # sno_word = sno.stem(word)
    # ps_word = ps.stem(word)
    # lemma_word = lemma.lemmatize(word)
    if sentence != sno_word or sentence != lemma_word or sentence != ps_word:
        fd = pd.concat([fd,pd.DataFrame([[sentence, lemma_word, ps_word, sno_word]], index=[c], columns=['initial', 'lemma', 'ps', 'sno'])])
        c+=1
        # if c > 10000:
        #     break;
fd


# Nota: creo que deberíamos aplicar lemma/stem primero y despues stopwords.

Unnamed: 0,initial,lemma,ps,sno


Unnamed: 0,initial,lemma,ps,sno
0,happy mama's day mothers,happy mama 's day mother,happi mama 's day mother,happi mama 's day mother
1,@lysdeltellez i lost. please help find good home.,@ lysdeltellez i lost . please help find good ...,@ lysdeltellez i lost . pleas help find good h...,@ lysdeltellez i lost . pleas help find good h...
2,"@boomkatt yes yes i am, networking whore fulle...","@ boomkatt yes yes i am , networking whore ful...","@ boomkatt ye ye i am , network whore fullest ...","@ boomkatt yes yes i am , network whore fulles..."
3,you@snapplynn wish would tweet followed me.,you @ snapplynn wish would tweet followed me .,you @ snapplynn wish would tweet follow me .,you @ snapplynn wish would tweet follow me .
4,microeconomics project ihate subject &amp; be...,microeconomics project ihate subject & amp ; b...,microeconom project ihat subject & amp ; besid...,microeconom project ihat subject & amp ; besid...
...,...,...,...,...
29993,i want work tomorrow!,i want work tomorrow !,i want work tomorrow !,i want work tomorrow !
29994,@kandybee shuld dance like seriously best thi...,@ kandybee shuld dance like seriously best thi...,@ kandybe shuld danc like serious best thing h...,@ kandybe shuld danc like serious best thing h...
29995,"photo: got prints days ago, ready norskart exh...","photo : got print day ago , ready norskart exh...","photo : got print day ago , readi norskart exh...","photo : got print day ago , readi norskart exh..."
29996,@tove_liden thanks follow tove!,@ tove_liden thanks follow tove !,@ tove_liden thank follow tove !,@ tove_liden thank follow tove !
