# Importing Libraries

In [55]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.corpus import PlaintextCorpusReader
import os
import re
# Ignore Warnings
import warnings
warnings.filterwarnings('ignore')

In [73]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, recall_score, precision_score, precision_recall_curve, roc_auc_score, roc_curve
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn import svm

In [57]:
PATH = "Q3"

# Data Preprocessing

## Combining text files

In [59]:
# create function to append every txt file

def getTxt(path):
    main_corpus = PlaintextCorpusReader(path, '.*')

    # list of files
    corpus_list = []

    for root, dirs, filenames in os.walk(PATH):
        for f in filenames:
            if f.endswith('.txt'):
                    corpus_list.append(f)
    
     # combine sentences from all corpus into one list
    documents = []
    for corpus in corpus_list:
        corpus = main_corpus.raw(corpus)
        d = corpus.strip().split('\n')
        for s in d:
            s = s.strip()
            s = ' '.join(s.split())
            documents.append(s)
            
    return documents

In [60]:
txt_labeled = getTxt(PATH)
txt_labeled[:5]

['### abstract ###',
 'AIMX we test in the context of a dictator game the proposition that individuals may experience a self-control conflict between the temptation to act selfishly and the better judgment to act pro-socially',
 'OWNX we manipulated the likelihood that individuals would identify self-control conflict, and we measured their trait ability to implement self-control strategies',
 'OWNX our analysis reveals a positive and significant correlation between trait self-control and pro-social behavior in the treatment where we expected a relatively high likelihood of conflict identification-but not in the treatment where we expected a low likelihood',
 'OWNX the magnitude of the effect is of economic significance']

## Split labels and text

In [61]:
sentences = []
labels = []

for txt in txt_labeled:
    # check for headers containing ###
    if txt[:3] != '###':
        # get sentence
        sentence = txt[5:]
        label = txt[:4]

        sentences.append(sentence)
        labels.append(label)

print(sentences[:4])
print(labels[:4])

['we test in the context of a dictator game the proposition that individuals may experience a self-control conflict between the temptation to act selfishly and the better judgment to act pro-socially', 'we manipulated the likelihood that individuals would identify self-control conflict, and we measured their trait ability to implement self-control strategies', 'our analysis reveals a positive and significant correlation between trait self-control and pro-social behavior in the treatment where we expected a relatively high likelihood of conflict identification-but not in the treatment where we expected a low likelihood', 'the magnitude of the effect is of economic significance']
['AIMX', 'OWNX', 'OWNX', 'OWNX']


## Cleaning sentences

In [62]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/tugus/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [63]:
stops = set(stopwords.words('english'))

In [64]:
# decontracting words in english so it have better meaning
def decontract(phrase):
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

In [65]:
def clean_sentence(sentences, stopwords):
  preprocessed_sentences = []
  for sentence in sentences:
    decontract(sentence)
    # removing extra spaces and numbers
    sentence = re.sub("\S*\d\S*", "", sentence).strip()
    # removing non alphabels
    sentence = re.sub('[^A-Za-z]+', ' ', sentence)
    # https://gist.github.com/sebleier/554280
    sentence = ' '.join(e.lower() for e in sentence.split() if e.lower() not in stopwords)
    preprocessed_sentences.append(sentence.strip())
  return preprocessed_sentences

In [66]:
sentences_processed = clean_sentence(sentences, stops)
print(sentences_processed[:4])
print(labels[:4])

['test context dictator game proposition individuals may experience self control conflict temptation act selfishly better judgment act pro socially', 'manipulated likelihood individuals would identify self control conflict measured trait ability implement self control strategies', 'analysis reveals positive significant correlation trait self control pro social behavior treatment expected relatively high likelihood conflict identification treatment expected low likelihood', 'magnitude effect economic significance']
['AIMX', 'OWNX', 'OWNX', 'OWNX']


In [67]:
X = sentences
y = labels

# Modeling

In [68]:
def classif(x, y, estimator):
    y_pred = estimator.predict(x)
    print(classification_report(y, y_pred))

In [69]:
vectorizer = TfidfVectorizer()

In [70]:
X_vector = vectorizer.fit_transform(X)
X_vector = X_vector.toarray()
X_vector.shape

(3117, 4010)

In [74]:
# Split the labeled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_vector, y, test_size=0.2, random_state=0)

In [75]:
classifierSVM = svm.SVC()
classifierSVM.fit(X_train, y_train)

SVC()

In [76]:
classif(X_test, y_test, classifierSVM)

              precision    recall  f1-score   support

        AIMX       0.78      0.60      0.68        42
        BASE       0.50      0.14      0.22         7
        CONT       0.65      0.33      0.44        33
        MISC       0.88      0.95      0.92       371
        OWNX       0.82      0.83      0.82       171

    accuracy                           0.85       624
   macro avg       0.73      0.57      0.62       624
weighted avg       0.84      0.85      0.84       624



In [77]:
classifierRF = RandomForestClassifier()
classifierRF.fit(X_train, y_train)

RandomForestClassifier()

In [78]:
classif(X_test, y_test, classifierRF)

              precision    recall  f1-score   support

        AIMX       0.71      0.64      0.67        42
        BASE       0.67      0.86      0.75         7
        CONT       0.52      0.42      0.47        33
        MISC       0.90      0.91      0.91       371
        OWNX       0.79      0.81      0.80       171

    accuracy                           0.84       624
   macro avg       0.72      0.73      0.72       624
weighted avg       0.84      0.84      0.84       624



In [79]:
classifierDT = DecisionTreeClassifier()
classifierDT.fit(X_train, y_train)

DecisionTreeClassifier()

In [80]:
classif(X_test, y_test, classifierDT)

              precision    recall  f1-score   support

        AIMX       0.66      0.69      0.67        42
        BASE       0.43      0.86      0.57         7
        CONT       0.40      0.52      0.45        33
        MISC       0.90      0.88      0.89       371
        OWNX       0.81      0.75      0.78       171

    accuracy                           0.82       624
   macro avg       0.64      0.74      0.67       624
weighted avg       0.83      0.82      0.82       624

