In [1]:
import os
os.chdir("../../")

import pandas as pd
import numpy as np

import spacy
import string
import nltk
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, KFold, cross_validate, GridSearchCV

from imblearn.over_sampling import ADASYN


# Sklearn models
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier

from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score

from src.text.utils import *

In [2]:
st_path = os.getcwd() + "/data/text/solomon_islands/solomon_times_news.csv"
st = pd.read_csv(st_path).drop("Unnamed: 0", axis=1)
st["tag"] = st["tag"].str.lower()
st["label"] = ["economy" in i for i in st["tag"]]

In [3]:
abc_path = os.getcwd() + "/data/text/abc_au/solomon_islands_abc_news.csv"
abc = pd.read_csv(abc_path).drop("Unnamed: 0", axis=1)
abc["tags"] = abc["tags"].fillna("missing").str.lower()
abc["label"] = ["economy" in i or "aid" in i for i in abc["tags"]]

In [4]:
df = pd.concat([abc[["news", "label"]], st[["news", "label"]]]).reset_index(drop=True)

In [5]:
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
spacy_stopwords = list(nlp.Defaults.stop_words)

nltk_stopwords = stopwords.words("english")
nltk_unique = [sw for sw in nltk_stopwords if sw not in spacy_stopwords]
spacy_stopwords.extend(nltk_unique)

In [6]:
def remove_punctuation(text: str):
    return "".join([i for i in text if i not in string.punctuation])

def remove_stopwords(text: str, stopwords: list):
    return "".join([i for i in text if i not in stopwords])

In [7]:
data = st.news.values.tolist()
data_words = list(sent_to_words(data))

bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)

trigram = gensim.models.Phrases(bigram[data_words], min_count=5, threshold=100)
trigram_mod = gensim.models.phrases.Phraser(trigram)

texts_preprocessed = preprocess_text(data_words, spacy_stopwords, bigram_mod,
                                     trigram_mod, nlp)

Stopwords has been done.


In [8]:
st["texts"] = texts_preprocessed
st["texts"] = st["texts"].apply(lambda x: " ".join(i for i in x))
X_train, X_test, y_train, y_test = train_test_split(
    st['texts'],   
    st['label'], 
    test_size=0.1, 
    random_state=42,  
    shuffle=True)   

In [13]:
from imblearn.combine import SMOTEENN
senn = SMOTEENN(random_state=42)

In [17]:
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_vectors_tfidf = tfidf_vectorizer.transform(X_test)

In [18]:
lr_tfidf = LogisticRegression(solver='liblinear', C=10, penalty='l2')
lr_tfidf.fit(X_train_vectors_tfidf, y_train)

y_predict = lr_tfidf.predict(X_test_vectors_tfidf)
y_prob = lr_tfidf.predict_proba(X_test_vectors_tfidf)[:, 1]


print(classification_report(y_test,y_predict))
print('Confusion Matrix:',confusion_matrix(y_test, y_predict))

fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)
print('AUC:', roc_auc)

              precision    recall  f1-score   support

       False       0.76      0.70      0.73       521
        True       0.75      0.81      0.78       584

    accuracy                           0.76      1105
   macro avg       0.76      0.75      0.75      1105
weighted avg       0.76      0.76      0.75      1105

Confusion Matrix: [[364 157]
 [113 471]]
AUC: 0.8268806036862726


In [20]:
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.1, 1, 10, 100]
}

lr = LogisticRegression()
clf = GridSearchCV(lr, param_grid = param_grid, cv=5, scoring="accuracy")
clf.fit(X_train_vectors_tfidf, y_train)
print(clf.best_params_)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

{'C': 1, 'penalty': 'l2'}


In [22]:
models = [
    RandomForestClassifier(n_estimators=1000, max_depth=5, random_state=42),
    MultinomialNB(),
    LinearSVC(random_state=42),
    LogisticRegression(solver='liblinear', C=1, penalty='l2', random_state=42),
    XGBClassifier(tree_method="hist", enable_categorical=True, random_state=42)
]

scoring = ['accuracy', 'f1_macro', 'recall_macro', 'precision_macro']

for model in models:
    model_name = model.__class__.__name__
    result = cross_validate(model,
                            X_train_vectors_tfidf,
                            y_train,
                            cv=5,
                            scoring=scoring)
    print(
        "%s: Mean Accuracy = %.2f%%; Mean F1-macro = %.2f%%; Mean recall-macro = %.2f%%; Mean precision-macro = %.2f%%"
        % (model_name, result['test_accuracy'].mean() * 100,
           result['test_f1_macro'].mean() * 100,
           result['test_recall_macro'].mean() * 100,
           result['test_precision_macro'].mean() * 100))

RandomForestClassifier: Mean Accuracy = 62.48%; Mean F1-macro = 53.19%; Mean recall-macro = 59.32%; Mean precision-macro = 77.15%
MultinomialNB: Mean Accuracy = 74.40%; Mean F1-macro = 73.14%; Mean recall-macro = 73.14%; Mean precision-macro = 76.30%




LinearSVC: Mean Accuracy = 75.08%; Mean F1-macro = 74.74%; Mean recall-macro = 74.63%; Mean precision-macro = 75.08%
LogisticRegression: Mean Accuracy = 77.05%; Mean F1-macro = 76.50%; Mean recall-macro = 76.32%; Mean precision-macro = 77.60%
XGBClassifier: Mean Accuracy = 75.53%; Mean F1-macro = 75.05%; Mean recall-macro = 74.90%; Mean precision-macro = 75.79%


## Prediction

In [23]:
target = os.getcwd() + "/data/text/solomon_islands/"
news_path = [filename for filename in os.listdir(target) if "news" in filename and "times" not in filename]

In [24]:
news = pd.DataFrame()
for path in news_path:
    temp = pd.read_csv(target + path).drop("Unnamed: 0", axis=1)
    temp["from"] = path.replace("_news.csv", "")
    if news.empty:
        news = temp
    else: 
        news = pd.concat([news, temp], axis=0)

news = news[news.news.isna() != True].reset_index(drop=True)

In [25]:
data = news.news.values.tolist()
data_words = list(sent_to_words(data))

bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)

trigram = gensim.models.Phrases(bigram[data_words], min_count=5, threshold=100)
trigram_mod = gensim.models.phrases.Phraser(trigram)

texts_preprocessed = preprocess_text(data_words, spacy_stopwords, bigram_mod,
                                     trigram_mod, nlp)

Stopwords has been done.


In [26]:
news["texts"] = texts_preprocessed
news["texts"] = news["texts"].apply(lambda x: " ".join(i for i in x))

In [29]:
X_vectors_tfidf = tfidf_vectorizer.fit_transform(news.texts)
lr_tfidf.predict(X_vectors_tfidf)

ValueError: X has 35383 features, but LogisticRegression is expecting 23826 features as input.

In [None]:
econ_news = news[news.econ_prob >= 0.9].reset_index(drop=True)
st["tag"] = st["tag"].str.lower()
st["label"] = ["economy" in i for i in st["tag"]]
st_econ = st[st.label == True].reset_index(drop=True)
econ_news = pd.concat([st[["date", "news"]], econ_news[["date", "news"]]]).reset_index(drop=True)
econ_news["date"] = pd.to_datetime(econ_news["date"])
econ_news

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sent_df = pd.DataFrame()
for row in econ_news.news:
    sample = row.replace("\n", " ").strip()
    
    sid = SentimentIntensityAnalyzer()
    ss = sid.polarity_scores(sample)
    ss_temp = pd.DataFrame(ss, columns=ss.keys(), index=[0])
    sent_df = pd.concat([sent_df, ss_temp], axis=0)

In [None]:
econ_news["sentiment"] = sent_df["compound"].tolist()
econ_sent = econ_news.set_index("date").groupby(pd.Grouper(freq="M"))[["sentiment"]].mean().reset_index()

In [None]:
econ_sent.plot(x="date", y="sentiment");