In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
from tqdm import tqdm
import os
import sys
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn import svm
from sklearn.metrics import classification_report as report
from sklearn.feature_extraction.text import CountVectorizer
import argparse
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import MultiLabelBinarizer

In [3]:
def make_dataframe_st3(input_folder, labels_fn=None):
    #MAKE TXT DATAFRAME
    text = []
    for fil in tqdm(filter(lambda x: x.endswith('.txt'), os.listdir(input_folder))):
        iD = fil[7:].split('.')[0]
        lines = list(enumerate(open(input_folder + '/' + fil,'r',encoding='utf-8').read().splitlines(),1))
        text.extend([(iD,) + line for line in lines])

    df_text = pd.DataFrame(text, columns=['id','line','text'])
    df_text.id = df_text.id.apply(int)
    df_text.line = df_text.line.apply(int)
    df_text = df_text[df_text.text.str.strip().str.len() > 0].copy()
    df_text = df_text.set_index(['id','line'])
    
    df = df_text

    if labels_fn:
        #MAKE LABEL DATAFRAME
        labels = pd.read_csv(labels_fn,sep='\t',encoding='utf-8',header=None)
        labels = labels.rename(columns={0:'id',1:'line',2:'labels'})
        labels = labels.set_index(['id','line'])
        labels = labels[labels.labels.notna()].copy()

        #JOIN
        df = labels.join(df_text)[['text','labels']]

    return df

In [4]:
folder_train = "/content/drive/MyDrive/CS546/course-project/SemEval/data/data_en_subtask3/train-articles-subtask-3"
folder_dev = "/content/drive/MyDrive/CS546/course-project/SemEval/data/data_en_subtask3/dev-articles-subtask-3"
labels_train_fn = "/content/drive/MyDrive/CS546/course-project/SemEval/data/data_en_subtask3/train-labels-subtask-3.txt"

#Read Data
print('Loading dataset...')
train = make_dataframe_st3(folder_train, labels_train_fn)
test = make_dataframe_st3(folder_dev)

Loading dataset...


446it [00:03, 132.50it/s]
90it [00:21,  4.23it/s]


In [5]:
X_train = train['text'].values
Y_train = train['labels'].fillna('').str.split(',').values

X_test = test['text'].values

multibin= MultiLabelBinarizer() #use sklearn binarizer

Y_train = multibin.fit_transform(Y_train)
#Create train-test split

pipe = Pipeline([('vectorizer',CountVectorizer(ngram_range = (1, 2), 
                                            analyzer='word')),
            ('SVM_multiclass', MultiOutputClassifier(svm.SVC(class_weight= None,C=1, kernel='linear'),n_jobs=1))])

print('Fitting SVM...')
pipe.fit(X_train,Y_train)

print('In-sample Acc: \t\t', pipe.score(X_train,Y_train))

Y_pred = pipe.predict(X_test)
out = multibin.inverse_transform(Y_pred)
out = list(map(lambda x: ','.join(x), out))
out = pd.DataFrame(out, test.index)
# out.to_csv(out_fn, sep='\t', header=None)
# print('Results on: ', out_fn)

Fitting SVM...
In-sample Acc: 		 0.9984042553191489


In [6]:
out

Unnamed: 0_level_0,Unnamed: 1_level_0,0
id,line,Unnamed: 2_level_1
820791520,1,
820791520,3,Loaded_Language
820791520,5,Loaded_Language
820791520,6,
820791520,7,Loaded_Language
...,...,...
813953273,43,
813953273,44,Loaded_Language
813953273,45,"Doubt,Loaded_Language"
813953273,46,


In [None]:
train # paragraph-level multi-label

Unnamed: 0_level_0,Unnamed: 1_level_0,text,labels
id,line,Unnamed: 2_level_1,Unnamed: 3_level_1
111111111,3,Geneva - The World Health Organisation chief o...,Doubt
111111111,5,"""The next transmission could be more pronounce...",Appeal_to_Authority
111111111,13,"But Tedros voiced alarm that ""plague in Madaga...",Repetition
111111111,17,He also pointed to the presence of the pneumon...,Appeal_to_Fear-Prejudice
111111111,19,He praised the rapid response from WHO and Mad...,Appeal_to_Fear-Prejudice
...,...,...,...
999001970,4,Also the Left killed comedy. This is what its ...,"Exaggeration-Minimisation,Slogans"
999001970,5,Saturday Night Live writer and comedian Nimesh...,Exaggeration-Minimisation
999001970,6,That's what Columbia snowflakes thought was of...,Name_Calling-Labeling
999001970,8,"Comrades, these jokes you have been listening ...","Exaggeration-Minimisation,Name_Calling-Labeling"


In [None]:
test

Unnamed: 0_level_0,Unnamed: 1_level_0,text
id,line,Unnamed: 2_level_1
820791520,1,George III Lost America.
820791520,3,Theresa May Could Lose the United Kingdom Over...
820791520,5,Britain is locked in the most serious peacetim...
820791520,6,Brexit has shown the world a British parliamen...
820791520,7,One veteran of Margaret Thatcher’s cabinet sai...
...,...,...
813953273,43,Rough sleepers and aggressive beggars are a pe...
813953273,44,"Yet while innocent blood runs in the gutters, ..."
813953273,45,And instead of being able to celebrate an opti...
813953273,46,"The first of many more to come, no doubt."


In [None]:
train.to_csv("/content/drive/MyDrive/CS546/course-project/SemEval/data/data_en_subtask3/train.csv")
test.to_csv("/content/drive/MyDrive/CS546/course-project/SemEval/data/data_en_subtask3/test.csv")