In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import random
import nltk
from pipeline import clean, lemmatizeTot, stemmerTot
from tempfile import mkdtemp
from shutil import rmtree
from time import time
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import StandardScaler 
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import NMF, TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn import svm
import sklearn.metrics as metrics
import seaborn as sn
import joblib
np.random.seed(42)
random.seed(42)

In [13]:
def classification_metrics(test_labels, pred, multi=False, pos='climate'):
    """Compute accuracy, recall, precision and F1 for a set of predictions.

    Parameters
    ----------
    test_labels : array-like
        Ground-truth labels.
    pred : array-like
        Predicted labels, aligned with ``test_labels``.
    multi : bool, default False
        When True the scores are macro-averaged; otherwise binary averaging
        is used.
    pos : label, default 'climate'
        Forwarded as ``pos_label`` to the recall/precision/F1 scorers.

    Returns
    -------
    tuple
        ``(accuracy, recall, precision, f1)`` in that order.
    """
    scorer_kwargs = {
        'average': 'macro' if multi else 'binary',
        'pos_label': pos,
    }
    return (
        metrics.accuracy_score(test_labels, pred),
        metrics.recall_score(test_labels, pred, **scorer_kwargs),
        metrics.precision_score(test_labels, pred, **scorer_kwargs),
        metrics.f1_score(test_labels, pred, **scorer_kwargs),
    )

In [14]:
def plot_roc(test_labels, DecScore, pos_label="climate", title="ROC curve"):
    """Plot the ROC curve for a set of decision scores and return its AUC.

    Parameters
    ----------
    test_labels : array-like
        Ground-truth labels.
    DecScore : array-like
        Scores passed to ``metrics.roc_curve`` as the target scores
        (e.g. decision-function values or positive-class probabilities).
    pos_label : label, default "climate"
        Label treated as the positive class.
    title : str, default "ROC curve"
        Title shown above the plot.

    Returns
    -------
    float
        Area under the ROC curve (false positive rate on x, true positive
        rate on y).
    """
    fpr, tpr, _ = metrics.roc_curve(test_labels, DecScore, pos_label=pos_label)
    # auc(x, y): FPR is the x coordinate and TPR the y coordinate. Passing
    # them the other way round integrates the mirrored curve (1 - AUC).
    auc = metrics.auc(fpr, tpr)
    fig = plt.figure()
    # Plot in the same orientation as the axis labels below.
    plt.plot(fpr, tpr)
    plt.title(title)
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.show()
    plt.close(fig)
    return auc

def clean(text, do=True):
    """Normalize one raw article string for tokenization.

    NOTE(review): this definition replaces the ``clean`` imported from
    ``pipeline`` at the top of the notebook; confirm that is intended.

    Steps, in order: drop lines that start with an http(s) URL, turn
    ``<br />`` into a space, decode the ``&quot;`` / ``&#39;`` / ``&amp;``
    entities, replace newlines and carriage returns with spaces, expand a
    standalone " u " to " you ", remove backticks, squeeze repeated ``!`` /
    ``?``, replace underscore runs with a space, remove digits, drop
    non-ASCII characters, strip remaining HTML tags and finally collapse
    runs of spaces.

    Parameters
    ----------
    text : str
        Raw text to clean.
    do : bool, default True
        When False the text is returned untouched.

    Returns
    -------
    str
        The cleaned text (may be empty).
    """
    if not do:
        return text
    # With MULTILINE, ^ anchors at every line start: whole URL-only lines go.
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    texter = re.sub(r"<br />", " ", text)
    texter = re.sub(r"&quot;", "\"", texter)
    # &#39; is the HTML entity for an apostrophe, not a double quote.
    texter = re.sub('&#39;', "'", texter)
    texter = re.sub('\n', " ", texter)
    texter = re.sub(' u ', " you ", texter)
    texter = re.sub('`', "", texter)
    texter = re.sub(r"(!)\1+", r"!", texter)
    texter = re.sub(r"(\?)\1+", r"?", texter)
    texter = re.sub('&amp;', 'and', texter)
    texter = re.sub('\r', ' ', texter)
    texter = re.sub(r'[_]+', ' ', texter)
    texter = re.sub(r'[\d]+', '', texter)
    texter = texter.encode('ascii', 'ignore').decode('ascii')
    texter = re.sub(r'<.*?>', '', texter)
    # Collapse spaces last: the underscore, digit, carriage-return and tag
    # steps above can each leave new runs of spaces behind.
    texter = re.sub(' +', ' ', texter)
    return texter



# Load the labelled articles, then decode the URL-escaped "forest fire"
# token (%22forest%20fire%22) wherever it occurs in the frame.
df = pd.read_csv("Project1-Classification.csv").replace(
    {'%22forest%20fire%22': 'forest fire'},
    regex=True,
)

In [15]:
from nltk import WordNetLemmatizer, SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# Shared NLTK normalizers used to build the corpus variants below.
lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer("english", ignore_stopwords=True)
vectorizer = TfidfVectorizer(stop_words='english', min_df=5)

# df_l: original text, tokenized and lemmatized (no cleaning step).
df_l = df.copy()
df_l['full_text'] = df_l['full_text'].apply(nltk.word_tokenize)
lemmaTemp = [
    ' '.join(lemmatizer.lemmatize(word) for word in article)
    for article in df_l["full_text"]
]
df_l['full_text'] = lemmaTemp

# df_c: text passed through clean() only.
df_c = df.copy()
df_c['full_text'] = df_c['full_text'].apply(clean)

# df_cs: cleaned text, tokenized and stemmed.
df_cs = df_c.copy()
df_cs['full_text'] = df_cs['full_text'].apply(nltk.word_tokenize)
stemmerTemp = [
    ' '.join(stemmer.stem(word) for word in article)
    for article in df_cs["full_text"]
]
df_cs['full_text'] = stemmerTemp

# df_cl: cleaned text, tokenized and lemmatized.
df_cl = df_c.copy()
df_cl['full_text'] = df_cl['full_text'].apply(nltk.word_tokenize)
lemmaTemp = [
    ' '.join(lemmatizer.lemmatize(word) for word in article)
    for article in df_cl["full_text"]
]
df_cl['full_text'] = lemmaTemp

In [16]:
# Pipeline stages shared by every experiment below. Each experiment refits
# these same objects, so they always hold the state of the last fit.
vectorizer = TfidfVectorizer(min_df=5, stop_words='english')
scaler = StandardScaler(with_mean=False)  # not used by the cells visible here
lsi = TruncatedSVD(n_components=80, random_state=42)

# Two logistic-regression variants that differ in penalty type and strength.
logl1 = LogisticRegression(
    penalty='l1',
    C=10,
    solver='saga',
    max_iter=20000,
    random_state=42,
)
logl2 = LogisticRegression(
    penalty='l2',
    C=500,
    solver='saga',
    max_iter=20000,
    random_state=42,
)

Dirty lemma 1: uncleaned ("dirty") text, lemmatized (df_l)

In [17]:
# Hold out 20% of the lemmatized (uncleaned) articles; the seed is fixed.
train_l, test_l = train_test_split(
    df_l,
    test_size=0.2,
    random_state=42,
)

In [18]:
# Fit the TF-IDF vocabulary on the training split only, then apply the
# fitted vectorizer to the held-out split.
vect_train_l = vectorizer.fit_transform(raw_documents=train_l['full_text'])
vect_test_l = vectorizer.transform(raw_documents=test_l['full_text'])

In [19]:
# Fit the SVD projection on the training TF-IDF matrix (labels are passed
# through as `y`), then project the test matrix with the fitted components.
red_train_l = lsi.fit_transform(vect_train_l, y=train_l['root_label'])
red_test_l = lsi.transform(vect_test_l)

In [20]:
# Train the L1-penalised model on the reduced features; fit() returns the
# estimator itself, so the prediction on the held-out split can be chained.
pred_l1 = logl1.fit(red_train_l, train_l['root_label']).predict(red_test_l)

In [21]:
# Score the L1 model on the held-out split and print one metric per line.
acc, recall, precision, f1 = classification_metrics(test_l['root_label'], pred_l1)
print(
    f"acc: {acc}",
    f"recall: {recall}",
    f"precision: {precision}",
    f"f1: {f1}",
    sep="\n",
)

acc: 0.9650793650793651
recall: 0.9771863117870723
precision: 0.9413919413919414
f1: 0.9589552238805971


In [23]:
# Same evaluation with the L2-penalised model on the same reduced features.
pred_l2 = logl2.fit(red_train_l, train_l['root_label']).predict(red_test_l)
acc, recall, precision, f1 = classification_metrics(test_l['root_label'], pred_l2)
print(
    f"acc: {acc}",
    f"recall: {recall}",
    f"precision: {precision}",
    f"f1: {f1}",
    sep="\n",
)

acc: 0.9650793650793651
recall: 0.9771863117870723
precision: 0.9413919413919414
f1: 0.9589552238805971


Clean stem: cleaned text, stemmed (df_cs)

In [24]:
# Hold out 20% of the cleaned + stemmed articles (fixed seed), then refit
# the shared TF-IDF vectorizer on this training split.
train_cs, test_cs = train_test_split(
    df_cs,
    test_size=0.2,
    random_state=42,
)
vect_train_cs = vectorizer.fit_transform(raw_documents=train_cs['full_text'])
vect_test_cs = vectorizer.transform(raw_documents=test_cs['full_text'])

In [25]:
# Refit the shared SVD projection on the stemmed training matrix and
# project the test matrix with the fitted components.
red_train_cs = lsi.fit_transform(vect_train_cs, y=train_cs['root_label'])
red_test_cs = lsi.transform(vect_test_cs)

In [26]:
# L1-penalised model on the cleaned + stemmed features.
pred_cs1 = logl1.fit(red_train_cs, train_cs['root_label']).predict(red_test_cs)
acc, recall, precision, f1 = classification_metrics(test_cs['root_label'], pred_cs1)
print(
    f"acc: {acc}",
    f"recall: {recall}",
    f"precision: {precision}",
    f"f1: {f1}",
    sep="\n",
)

acc: 0.9682539682539683
recall: 0.9809885931558935
precision: 0.945054945054945
f1: 0.962686567164179


In [27]:
# L2-penalised model on the cleaned + stemmed features.
pred_cs2 = logl2.fit(red_train_cs, train_cs['root_label']).predict(red_test_cs)
acc, recall, precision, f1 = classification_metrics(test_cs['root_label'], pred_cs2)
print(
    f"acc: {acc}",
    f"recall: {recall}",
    f"precision: {precision}",
    f"f1: {f1}",
    sep="\n",
)

acc: 0.9666666666666667
recall: 0.9809885931558935
precision: 0.9416058394160584
f1: 0.9608938547486034


Cleaned lemma: cleaned text, lemmatized (df_cl)

In [28]:
# Hold out 20% of the cleaned + lemmatized articles (fixed seed), then
# refit the shared TF-IDF vectorizer on this training split.
train_cl, test_cl = train_test_split(
    df_cl,
    test_size=0.2,
    random_state=42,
)
vect_train_cl = vectorizer.fit_transform(raw_documents=train_cl['full_text'])
vect_test_cl = vectorizer.transform(raw_documents=test_cl['full_text'])

In [29]:
# Refit the shared SVD projection on the cleaned + lemmatized training
# matrix and project the test matrix with the fitted components.
red_train_cl = lsi.fit_transform(vect_train_cl, y=train_cl['root_label'])
red_test_cl = lsi.transform(vect_test_cl)

In [30]:
# L1-penalised model on the cleaned + lemmatized features.
pred_cl1 = logl1.fit(red_train_cl, train_cl['root_label']).predict(red_test_cl)
acc, recall, precision, f1 = classification_metrics(test_cl['root_label'], pred_cl1)
print(
    f"acc: {acc}",
    f"recall: {recall}",
    f"precision: {precision}",
    f"f1: {f1}",
    sep="\n",
)

acc: 0.9682539682539683
recall: 0.9809885931558935
precision: 0.945054945054945
f1: 0.962686567164179
