# Datos de Twitter para análisis de sentimiento y COLCAP

## Paquetes de Python para procesamiento y análisis de texto

In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split 
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from joblib import dump, load

In [2]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     D:\Usuarios\rhaps\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     D:\Usuarios\rhaps\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Cargar bases de datos

In [3]:
data = pd.read_csv('datasets/MUESTRA.csv', encoding = "ISO-8859-1", sep=";")
data.head()

Unnamed: 0,Platform,Date & Time (GMT-0500),User,Name,Bio,Followers,Engagements,Location,Sentiment,URL,Post Text,Original Post (Twt only),CLASIFICACION,Unnamed: 13
0,Twitter,25/05/2018 9:18,http://twitter.com/RedDeApoyoSIC,RED DE APOYO S.I.C,sá´á´á´sÂ á´É´Â É¢Êá´á´á´Â ð¥Â ÉªÉ´Ò...,5587,,CO,Neutral,http://twitter.com/RedDeApoyoSIC/statuses/1000...,Indicadores EconÃ³micos ððð ðµ DÃ³...,,1.0,
1,Twitter,16/09/2016 9:30,http://twitter.com/fdez_adriana,Adriana FernÃ¡ndez,Profesional en Comercio Exterior.Asesora en Im...,357,,Colombia,Neutral,http://twitter.com/fdez_adriana/statuses/77677...,RT @revistatym: #BuenViernes los #IndicadoresE...,#BuenViernes los #IndicadoresEconÃ³micos del d...,1.0,
2,Twitter,26/08/2016 8:04,http://twitter.com/convergenciasas,Convergencia Econ.,"AnÃ¡lisis econÃ³mico con rigurosidad, responsa...",333,,Colombia,Neutral,http://twitter.com/convergenciasas/statuses/76...,RT @Asecarga: #BuenViernes los #IndicadoresEco...,#BuenViernes los #IndicadoresEconÃ³micos del d...,1.0,
3,Twitter,25/05/2018 9:38,http://twitter.com/agenciacmalatam,Agencia CMA Latam,Empresa multinacional brasileira que desde 197...,89,,,Neutral,http://twitter.com/agenciacmalatam/statuses/10...,MERCADO COLOMBIA: Colcap abre con caÃ­da del 0...,,-1.0,
4,Twitter,15/07/2016 7:45,http://twitter.com/convergenciasas,Convergencia Econ.,"AnÃ¡lisis econÃ³mico con rigurosidad, responsa...",333,,Colombia,Neutral,http://twitter.com/convergenciasas/statuses/75...,RT @Asecarga: #BuenViernes los #IndicadoresEco...,#BuenViernes los #IndicadoresEconÃ³micos para ...,1.0,


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43003 entries, 0 to 43002
Data columns (total 14 columns):
Platform                    33634 non-null object
Date & Time (GMT-0500)      33634 non-null object
User                        33634 non-null object
Name                        33633 non-null object
Bio                         29743 non-null object
Followers                   32214 non-null object
Engagements                 816 non-null float64
Location                    23240 non-null object
Sentiment                   31883 non-null object
URL                         31883 non-null object
Post Text                   31883 non-null object
Original Post (Twt only)    12254 non-null object
CLASIFICACION               33634 non-null float64
Unnamed: 13                 10 non-null float64
dtypes: float64(3), object(11)
memory usage: 4.6+ MB


In [5]:
data.loc[:,['Post Text', 'Original Post (Twt only)', 'CLASIFICACION']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43003 entries, 0 to 43002
Data columns (total 3 columns):
Post Text                   31883 non-null object
Original Post (Twt only)    12254 non-null object
CLASIFICACION               33634 non-null float64
dtypes: float64(1), object(2)
memory usage: 1008.0+ KB


In [6]:
data = data.loc[:,['Post Text', 'Original Post (Twt only)', 'CLASIFICACION']].copy()
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43003 entries, 0 to 43002
Data columns (total 3 columns):
Post Text                   31883 non-null object
Original Post (Twt only)    12254 non-null object
CLASIFICACION               33634 non-null float64
dtypes: float64(1), object(2)
memory usage: 1008.0+ KB


De los datos contamos con un total de 31883 de tweets con texto

In [7]:
data[data.iloc[:,0].isnull()].shape

(11120, 3)

y se pierden 9369 de tweets 

In [8]:
data_clean = data[data.iloc[:, 0].notnull()].copy()

In [9]:
data_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31883 entries, 0 to 41645
Data columns (total 3 columns):
Post Text                   31883 non-null object
Original Post (Twt only)    12254 non-null object
CLASIFICACION               31883 non-null float64
dtypes: float64(1), object(2)
memory usage: 996.3+ KB


## Proporción de las cantidades de etiquetado

In [10]:
data_clean.CLASIFICACION.value_counts()

 1.0    14705
-1.0    13269
 0.0     3909
Name: CLASIFICACION, dtype: int64

In [11]:
data_clean.CLASIFICACION.value_counts() /31883

 1.0    0.461218
-1.0    0.416178
 0.0    0.122605
Name: CLASIFICACION, dtype: float64

Podemos ver que contamos con una cantidad aproximada de 46% de tweets positivos, 41% para negativos y 12% para neutrales

In [12]:
X = data_clean.loc[:,'Post Text'].copy()
y = data_clean.CLASIFICACION.copy()

Utilizamos un enfoque hold-out donde particionaremos nuestros datos en dos conjuntos, uno para el entrenamiento (80%) de los modelos y el restante para el testeo de estos. 

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=12345, stratify=y)

In [14]:
vectorizer = CountVectorizer()
vectorizer.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

## Bag of words 

In [15]:
print(len(vectorizer.vocabulary_))
vectorizer.vocabulary_

9168


{'bolsa': 3122,
 'da': 3749,
 'colã': 3473,
 'mbia': 6005,
 'colcap': 3453,
 'fecha': 4472,
 'em': 4168,
 'baixa': 2996,
 'de': 3786,
 '18': 687,
 'bogotã': 3117,
 'fev': 4497,
 'efe': 4117,
 'ndice': 6292,
 'capitalizaã': 3296,
 'https': 5019,
 'co': 3445,
 'xlw84o10m9': 8836,
 '1133': 240,
 '68': 1975,
 'rt': 7553,
 'larepublica_co': 5705,
 'estos': 4320,
 'son': 7831,
 'los': 5859,
 'indicadoreslr': 5201,
 'del': 3808,
 'dã': 4038,
 'dã³lar': 4040,
 '924': 2378,
 '80': 2181,
 'euro': 4353,
 '371': 1374,
 '38': 1390,
 '343': 1286,
 '44': 1542,
 'petrã³leo': 6808,
 'us': 8367,
 '50': 1687,
 'cafâ': 3249,
 'update': 8348,
 'dowjones': 3977,
 '94': 2401,
 'nasdaq': 6267,
 '04': 51,
 'sp500': 7846,
 '95': 2412,
 'ibovespa': 5086,
 '16': 647,
 'ipc': 5306,
 '86': 2236,
 'ipsa': 5309,
 '67': 1967,
 'merval': 6052,
 '46': 1564,
 '72': 2064,
 'buenos': 3185,
 'as': 2871,
 'indicadoreseconã³micos': 5191,
 'para': 6698,
 'hoy': 5003,
 '942': 2404,
 '13': 418,
 '337': 1245,
 '25': 866,
 '333': 

In [30]:
# guardemos el diccionario de palabras
dump(vectorizer, 'bag_words_colcap.joblib') 

['bag_words_colcap.joblib']

In [16]:
X_train_features = vectorizer.transform(X_train)
X_test_features = vectorizer.transform(X_test)
type(X_train_features)

scipy.sparse.csr.csr_matrix

In [17]:
primera_palabra = 2500
primera_reseña = 10
palabras = vectorizer.get_feature_names()[primera_palabra:primera_palabra+10]
pd.DataFrame(X_train_features[primera_reseña:primera_reseña+7, primera_palabra:primera_palabra+10].todense(), columns=palabras)

Unnamed: 0,9p2vpsffuo,9qbst36xla,9qu1zeqvnm,9r6hlttrts,9rkeqmfuyi,9t9kdqlyen,9tzs2whoo2,9u6lz6dekg,9uamruu7gx,9uoxq9aqqq
0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0


In [18]:
acc_test_vec=[]
alpha_vec= [100, 50, 10, 5, 1, 0.5, 0.4, 0.3, 0.2, 0.1, 0.09, 0.08, 0.07, 0.05, 0.01, 0.005, 0.001]
for alpha in alpha_vec:
    modelo = MultinomialNB(alpha=alpha)
    modelo.fit(X_train_features, y_train)
    y_pred = modelo.predict(X_test_features)
    acc_test_vec.append(metrics.accuracy_score(y_test, y_pred))
alpha_optimo = alpha_vec[np.argmax(acc_test_vec)]
acc_optima= max(acc_test_vec)
print("El óptimo se obtiene con alpha=%f, que produce un %.3f%% de exactitud" %(alpha_optimo, 100*acc_optima))

El óptimo se obtiene con alpha=5.000000, que produce un 81.453% de exactitud


In [19]:
modelo = MultinomialNB(alpha=0.5)
modelo.fit(X_train_features, y_train)

MultinomialNB(alpha=0.5, class_prior=None, fit_prior=True)

## matriz de confusión 

In [20]:
target_names = ['positivos', 'neutrales', 'negativos']
y_pred = modelo.predict(X_test_features)
cm= metrics.confusion_matrix(y_test, y_pred)
cm

array([[3283,  280,  418],
       [  49,  994,  130],
       [ 637,  337, 3437]], dtype=int64)

In [21]:
target_names = ['positivos', 'neutrales', 'negativos']
print(classification_report(y_test, y_pred, target_names=target_names))

             precision    recall  f1-score   support

  positivos       0.83      0.82      0.83      3981
  neutrales       0.62      0.85      0.71      1173
  negativos       0.86      0.78      0.82      4411

avg / total       0.82      0.81      0.81      9565



In [22]:
print("Exactitud: ", metrics.accuracy_score(y_test, y_pred))
print("Kappa    : ", metrics.cohen_kappa_score(y_test, y_pred))

Exactitud:  0.8064819654992159
Kappa    :  0.6850859262834035


_El mejor modelo da una exactitud del 80%_

## Guardar el mejor modelo

In [24]:
#we are going to save the model 
dump(modelo, 'colcab_sentiment_model.joblib') 

['colcab_sentiment_model.joblib']

## Análisis de los datos sin etiquetado

### Datos Keyhole_COLCAP(scLM0a) 0101201601312018

In [44]:
# lets load and prove it again
modelo = load('colcab_sentiment_model.joblib')
vectorizer = load('bag_words_colcap.joblib')
data_test2018 = pd.read_csv('test/COLCAP2018.csv', encoding = "ISO-8859-1", sep=",")
data_test2018.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82959 entries, 0 to 82958
Data columns (total 12 columns):
Platform                    82959 non-null object
Date & Time (GMT-0500)      82959 non-null object
User                        82959 non-null object
Name                        82959 non-null object
Bio                         74167 non-null object
Followers                   82959 non-null int64
Engagements                 2060 non-null float64
Location                    60940 non-null object
Sentiment                   82959 non-null object
URL                         82959 non-null object
Post Text                   82959 non-null object
Original Post (Twt only)    28100 non-null object
dtypes: float64(1), int64(1), object(10)
memory usage: 7.6+ MB


In [45]:
X_test_2018 = vectorizer.transform(data_test2018.loc[:,'Post Text'])
modelo.predict(X_test_2018)

array([ 0.,  0.,  0., ...,  0.,  0., -1.])

In [46]:
serie  = pd.Series(modelo.predict(X_test_2018))  
data_test2018 = data_test2018.assign(Prediccion=serie).copy()

In [47]:
data_test2018.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82959 entries, 0 to 82958
Data columns (total 13 columns):
Platform                    82959 non-null object
Date & Time (GMT-0500)      82959 non-null object
User                        82959 non-null object
Name                        82959 non-null object
Bio                         74167 non-null object
Followers                   82959 non-null int64
Engagements                 2060 non-null float64
Location                    60940 non-null object
Sentiment                   82959 non-null object
URL                         82959 non-null object
Post Text                   82959 non-null object
Original Post (Twt only)    28100 non-null object
Prediccion                  82959 non-null float64
dtypes: float64(2), int64(1), object(10)
memory usage: 8.2+ MB


In [48]:
data_test2018.to_csv("results/Datos Keyhole_COLCAP(scLM0a) 0101201601312018.csv", sep=';', encoding='utf-8')

### Datos Keyhole_COLCAP(scLM0a) 0101201604032019

In [49]:
# lets load and prove it again
modelo = load('colcab_sentiment_model.joblib')
vectorizer = load('bag_words_colcap.joblib')
data_test2019 = pd.read_csv('test/COLCAP2019.csv', encoding = "ISO-8859-1", sep=",")
data_test2019.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84903 entries, 0 to 84902
Data columns (total 12 columns):
Platform                    84903 non-null object
Date & Time (GMT-0500)      84903 non-null object
User                        84903 non-null object
Name                        84903 non-null object
Bio                         75848 non-null object
Followers                   84903 non-null int64
Engagements                 2366 non-null float64
Location                    62214 non-null object
Sentiment                   84903 non-null object
URL                         84903 non-null object
Post Text                   84903 non-null object
Original Post (Twt only)    28813 non-null object
dtypes: float64(1), int64(1), object(10)
memory usage: 7.8+ MB


In [50]:
X_test_2019 = vectorizer.transform(data_test2019.loc[:,'Post Text'])
serie  = pd.Series(modelo.predict(X_test_2019))  
data_test2019 = data_test2019.assign(Prediccion=serie).copy()
data_test2019.to_csv("results/Keyhole_COLCAP(scLM0a) 0101201604032019.csv", sep=';', encoding='utf-8')

### Datos Keyhole_COLCAP(scLM0a) 0101201628022019

In [51]:
# lets load and prove it again
modelo = load('colcab_sentiment_model.joblib')
vectorizer = load('bag_words_colcap.joblib')
data_test2019_2 = pd.read_csv('test/COLCAP22019.csv', encoding = "ISO-8859-1", sep=",")
data_test2019_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84816 entries, 0 to 84815
Data columns (total 12 columns):
Platform                    84816 non-null object
Date & Time (GMT-0500)      84816 non-null object
User                        84816 non-null object
Name                        84816 non-null object
Bio                         75769 non-null object
Followers                   84816 non-null int64
Engagements                 2353 non-null float64
Location                    62152 non-null object
Sentiment                   84816 non-null object
URL                         84816 non-null object
Post Text                   84816 non-null object
Original Post (Twt only)    28775 non-null object
dtypes: float64(1), int64(1), object(10)
memory usage: 7.8+ MB


In [52]:
X_test_2019_2 = vectorizer.transform(data_test2019_2.loc[:,'Post Text'])
serie  = pd.Series(modelo.predict(X_test_2019_2))  
data_test2019_2 = data_test2019_2.assign(Prediccion=serie).copy()
data_test2019_2.to_csv("results/Keyhole_COLCAP(scLM0a) 0101201628022019.csv", sep=';', encoding='utf-8')