In [2]:
import nltk
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import f1_score,precision_score,recall_score,accuracy_score, confusion_matrix

In [3]:
data = pd.read_csv("../../raw_data/labeled_sentences_merged_imbalanced_12k_extra_classes.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,sentence,topic_label,topic_score,sentiment_label,sentiment_score,fls_label,fls_score,sub_cat_label,sub_cat_score
0,0,Fallout from the scandal could lead to a lost ...,Governance,0.453123,Negative,0.99986,Non-specific FLS,0.937266,Business Ethics & Values,0.983647
1,1,The damning parliamentary report into the demi...,Social,0.978643,Negative,0.99971,Non-specific FLS,0.708357,Community Relations,0.77319
2,2,The BHS scandal has been described by MPs as t...,Environmental,0.931507,Negative,0.95922,Not FLS,0.990499,Pollution & Waste,0.551511
3,3,"Dominic Chappell, the businessman who bought B...",Social,0.957609,Neutral,0.999972,Not FLS,0.993661,Product Liability,0.327208
4,4,The fallout from the scandal could lead to a k...,Governance,0.408408,Negative,0.998442,Non-specific FLS,0.93579,Business Ethics & Values,0.983201


In [4]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/basmasi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/basmasi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/basmasi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/basmasi/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [5]:
def combine_labels(df):
    df_copy = df.copy()
    df_copy['combined_label'] = df_copy['topic_label'] + df_copy['sentiment_label']
    return df_copy

In [6]:
df = combine_labels(pd.read_csv('../../raw_data/labeled_sentences_merged_imbalanced_12k_extra_classes.csv',usecols=['sentence','topic_label','sentiment_label']))
df.drop(['topic_label','sentiment_label'],axis=1,inplace=True)

In [7]:
df['combined_label'].value_counts()

NoneNeutral              6034
SocialNeutral            2356
EnvironmentalNeutral      867
NonePositive              766
SocialPositive            579
NoneNegative              559
SocialNegative            362
EnvironmentalPositive     304
EnvironmentalNegative     138
GovernanceNeutral         133
GovernancePositive         21
GovernanceNegative         20
Name: combined_label, dtype: int64

In [8]:
X = df[['sentence']].astype(str)
y = df.loc[:,'combined_label'].values

In [9]:
X

Unnamed: 0,sentence
0,Fallout from the scandal could lead to a lost ...
1,The damning parliamentary report into the demi...
2,The BHS scandal has been described by MPs as t...
3,"Dominic Chappell, the businessman who bought B..."
4,The fallout from the scandal could lead to a k...
...,...
12134,"""It may take a few years, but eventually the l..."
12135,(Reporting by Heekyong Yang in Seoul and Ben K...
12136,Click For Restrictions - https://agency.reuter...
12137,\n\n Users collect the needed &Charge k...


In [10]:
import string

def remove_punctuation(text):
    for punctuation in string.punctuation: 
        text = text.replace(punctuation, ' ') 
    return text

X = X.apply(remove_punctuation).astype(str)
X

Unnamed: 0,sentence
0,Fallout from the scandal could lead to a lost ...
1,The damning parliamentary report into the demi...
2,The BHS scandal has been described by MPs as t...
3,"Dominic Chappell, the businessman who bought B..."
4,The fallout from the scandal could lead to a k...
...,...
12134,"""It may take a few years, but eventually the l..."
12135,(Reporting by Heekyong Yang in Seoul and Ben K...
12136,Click For Restrictions - https://agency.reuter...
12137,\n\n Users collect the needed &Charge k...


In [11]:
X['sentence'] = X['sentence'].str.lower()
X

Unnamed: 0,sentence
0,fallout from the scandal could lead to a lost ...
1,the damning parliamentary report into the demi...
2,the bhs scandal has been described by mps as t...
3,"dominic chappell, the businessman who bought b..."
4,the fallout from the scandal could lead to a k...
...,...
12134,"""it may take a few years, but eventually the l..."
12135,(reporting by heekyong yang in seoul and ben k...
12136,click for restrictions - https://agency.reuter...
12137,\n\n users collect the needed &charge k...


In [12]:
def remove_numbers (text):
    words_only = ''.join([i for i in text if not i.isdigit()])
    return words_only

X['sentence'] = X.sentence.apply(remove_numbers)
X

Unnamed: 0,sentence
0,fallout from the scandal could lead to a lost ...
1,the damning parliamentary report into the demi...
2,the bhs scandal has been described by mps as t...
3,"dominic chappell, the businessman who bought b..."
4,the fallout from the scandal could lead to a k...
...,...
12134,"""it may take a few years, but eventually the l..."
12135,(reporting by heekyong yang in seoul and ben k...
12136,click for restrictions - https://agency.reuter...
12137,\n\n users collect the needed &charge k...


In [13]:
from nltk.corpus import stopwords 
from nltk import word_tokenize

stop_words = set(stopwords.words('english')) 

# Create function
def remove_stopwords (text):
    tokenized = word_tokenize(text)
    without_stopwords = [word for word in tokenized if not word in stop_words]
    return without_stopwords

X['sentence'] = X.sentence.apply(remove_stopwords)
X

Unnamed: 0,sentence
0,"[fallout, scandal, could, lead, lost, knightho..."
1,"[damning, parliamentary, report, demise, bhs, ..."
2,"[bhs, scandal, described, mps, “, unacceptable..."
3,"[dominic, chappell, ,, businessman, bought, bh..."
4,"[fallout, scandal, could, lead, knighthood, re..."
...,...
12134,"[``, may, take, years, ,, eventually, law, hel..."
12135,"[(, reporting, heekyong, yang, seoul, ben, kla..."
12136,"[click, restrictions, -, https, :, //agency.re..."
12137,"[users, collect, needed, &, charge, kilometres..."


In [14]:
from nltk.stem import WordNetLemmatizer

def lemma(text):
    lemmatizer = WordNetLemmatizer() # Initiate lemmatizer
    lemmatized = [lemmatizer.lemmatize(word) for word in text] # Lemmatize
    lemmatized_string = " ".join(lemmatized)
    return lemmatized_string

X['sentence'] = X.sentence.apply(lemma)
X

Unnamed: 0,sentence
0,"fallout scandal could lead lost knighthood , c..."
1,damning parliamentary report demise bh could f...
2,bh scandal described mp “ unacceptable face ca...
3,"dominic chappell , businessman bought bh green..."
4,fallout scandal could lead knighthood removed ...
...,...
12134,"`` may take year , eventually law help make el..."
12135,( reporting heekyong yang seoul ben klayman de...
12136,click restriction - http : //agency.reuters.co...
12137,user collect needed & charge kilometre complet...


In [15]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

X_bow = vectorizer.fit_transform(X.sentence)

In [19]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_validate

vectorizer = CountVectorizer()
naivebayes = MultinomialNB()


cv_nb = cross_validate(naivebayes,
                       X_bow,
                       y,
                       scoring = "accuracy")

round(cv_nb['test_score'].mean(),2) 

0.6