In [294]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [327]:
import numpy as np
import pandas as pd
from scipy.sparse import hstack

In [303]:
df_train = pd.read_csv("drive/My Drive/UST/6000H/ml4nlp-argimpact/train.csv")
df_test = pd.read_csv("drive/My Drive/UST/6000H/ml4nlp-argimpact/test.csv")
df_valid = pd.read_csv("drive/My Drive/UST/6000H/ml4nlp-argimpact/valid.csv")

In [304]:
# df_train = df_train.append(df_valid)

In [305]:
class_names = df_train.impact_label.unique().tolist()

In [306]:
df_train['text'] = df_train.apply(lambda row: eval(row.context)[-1] + ' ' + row.text, axis=1)
df_test['text'] = df_test.apply(lambda row: eval(row.context)[-1] + ' ' + row.text, axis=1)
df_valid['text'] = df_valid.apply(lambda row: eval(row.context)[-1] + ' ' + row.text, axis=1)

In [312]:
df_train['impact_label'] = df_train.impact_label.map({"NOT_IMPACTFUL": 0, "MEDIUM_IMPACT": 1, "IMPACTFUL": 2})
df_valid['impact_label'] = df_valid.impact_label.map({"NOT_IMPACTFUL": 0, "MEDIUM_IMPACT": 1, "IMPACTFUL": 2})

In [328]:
df_train['previous_argument_type'] = df_train['stance_label'].apply(lambda x: eval(x)[-1])
df_test['previous_argument_type'] = df_test['stance_label'].apply(lambda x: eval(x)[-1])
df_valid['previous_argument_type'] = df_valid['stance_label'].apply(lambda x: eval(x)[-1])

In [329]:
df_train['previous_argument_type'] = df_train.previous_argument_type.map({"OPPOSE": 0, "SUPPORT": 1})
df_test['previous_argument_type'] = df_test.previous_argument_type.map({"OPPOSE": 0, "SUPPORT": 1})
df_valid['previous_argument_type'] = df_valid.previous_argument_type.map({"OPPOSE": 0, "SUPPORT": 1})

In [None]:
X_train = hstack((X_train, df_train['previous_argument_type'].to_numpy()[:, None]))

In [339]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn import metrics

In [356]:
text_clf = Pipeline([
    ('feats', FeatureUnion([
        ('vect', CountVectorizer()),
        # ('tfidf', TfidfVectorizer()),
    ])),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                          alpha=1e-3, random_state=42,
                          max_iter=5, tol=None)),
])

In [357]:
text_clf.fit(df_train.text.to_numpy(), df_train.impact_label.to_numpy())

Pipeline(memory=None,
         steps=[('feats',
                 FeatureUnion(n_jobs=None,
                              transformer_list=[('vect',
                                                 CountVectorizer(analyzer='word',
                                                                 binary=False,
                                                                 decode_error='strict',
                                                                 dtype=<class 'numpy.int64'>,
                                                                 encoding='utf-8',
                                                                 input='content',
                                                                 lowercase=True,
                                                                 max_df=1.0,
                                                                 max_features=None,
                                                                 min_df=1,
                           

In [358]:
validation_pred = text_clf.predict(df_valid.text.to_numpy())

In [359]:
print(f"F-1 score: {metrics.f1_score(df_valid.impact_label.to_numpy(), validation_pred, average='macro')}")

F-1 score: 0.5263277824953437


In [346]:
metrics.classification_report(df_valid.impact_label.to_numpy(), validation_pred, target_names=class_names)

'               precision    recall  f1-score   support\n\nMEDIUM_IMPACT       0.41      0.29      0.34       252\n    IMPACTFUL       0.50      0.56      0.53       215\nNOT_IMPACTFUL       0.69      0.73      0.71       641\n\n     accuracy                           0.60      1108\n    macro avg       0.53      0.53      0.53      1108\n weighted avg       0.59      0.60      0.59      1108\n'

In [257]:
# # add previous argument type to the feature set
# X_train = hstack((X_train, df_train['previous_argument_type'].to_numpy()[:, None]))

In [258]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

In [280]:
# clf = MultinomialNB()
gs_clf = GridSearchCV(text_clf, {}, cv=5, n_jobs=-1, verbose=1)

In [281]:
gs_clf.fit(df_train.text.to_numpy(), df_train.impact_label.to_numpy())

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.1s finished


GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        prep

In [282]:
gs_clf.cv_results_

{'mean_fit_time': array([0.35956101]),
 'mean_score_time': array([0.08494029]),
 'mean_test_score': array([0.58505849]),
 'params': [{}],
 'rank_test_score': array([1], dtype=int32),
 'split0_test_score': array([0.58519108]),
 'split1_test_score': array([0.58678344]),
 'split2_test_score': array([0.58598726]),
 'split3_test_score': array([0.58406375]),
 'split4_test_score': array([0.58326693]),
 'std_fit_time': array([0.05721224]),
 'std_score_time': array([0.01751287]),
 'std_test_score': array([0.00126924])}

In [283]:
predictions = gs_clf.predict(df_test.text.to_numpy())

In [289]:
submission = df_test[['id']]

In [290]:
submission['pred'] = predictions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [292]:
submission.to_csv('drive/My Drive/UST/6000H/bayes.csv', index=False)