In [1]:
from pathlib import Path
from tira.rest_api_client import Client
from tira.third_party_integrations import get_output_directory
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


In [2]:
tira = Client()

    # loading train data
text_train = tira.pd.inputs(
        "nlpbuw-fsu-sose-24", "authorship-verification-train-20240408-training"
)
targets_train = tira.pd.truths(
        "nlpbuw-fsu-sose-24", "authorship-verification-train-20240408-training"
)
 # loading validation data (automatically replaced by test data when run on tira)
text_validation = tira.pd.inputs(
        "nlpbuw-fsu-sose-24", "authorship-verification-validation-20240408-training"
)
targets_validation = tira.pd.truths(
        "nlpbuw-fsu-sose-24", "authorship-verification-validation-20240408-training"
)

In [3]:
pd.set_option('display.max_colwidth', 50)
concat_train = pd.concat([text_train, targets_train['generated']], axis=1)
concat_val = pd.concat([text_validation, targets_validation['generated']], axis=1)
print(concat_train)

        id                                               text  generated
0     1253  Kamala Harris to promote 'America is back' mes...          0
1      748  Tampa Bay Buccaneers Odds & Betting Lines\n\nT...          0
2     1415  Kamala Harris Tackles Root Causes of Migration...          1
3      333  Twitter Permanently Bans President Trump's Acc...          1
4      515  Tiger Woods Hospitalized with Severe Leg Injur...          1
...    ...                                                ...        ...
1769  1412  Tom Brady and the Tampa Bay Buccaneers will de...          1
1770  1609  Schumer pledges quick delivery of $2,000 stimu...          0
1771  1079  The Gabby Petito Case: How Social Media Shaped...          1
1772  1811  "Not Guilty" Verdict Sparks Controversy: Lin W...          1
1773  1406  Coronavirus (COVID-19) Update: FDA Authorizes ...          0

[1774 rows x 3 columns]


In [4]:
concat_train.generated.value_counts()

generated
0    887
1    887
Name: count, dtype: int64

In [5]:
try:
    nlp = spacy.load('en_core_web_md')
except OSError:
    print('Downloading language model for the spaCy POS tagger')
    from spacy.cli import download
    download('en_core_web_md')
    nlp = spacy.load('en_core_web_md')

for data in (concat_train, concat_val):
    data['word2vec_doc'] = data['text'].apply(lambda text: nlp(text).vector)

In [6]:
print(data['word2vec_doc'])

0      [-2.495441, -0.3262986, -2.0509212, 0.8890234,...
1      [-2.3321953, 0.71726745, -1.2576153, 0.6445607...
2      [-2.0062313, -0.36596125, -1.4966927, 0.111295...
3      [-1.8860848, 0.8616338, -1.5257045, 0.55567193...
4      [-3.2109346, 0.78183866, -1.8826302, 1.7338719...
                             ...                        
195    [-2.095308, -0.20857623, -1.4425899, 0.5381457...
196    [-1.8967743, 0.034303647, -1.6071426, 0.351355...
197    [-2.611918, -0.35038522, -0.48586276, 0.619382...
198    [-2.2647674, 0.54415184, -1.8161175, 0.2111554...
199    [-1.5158886, 0.074764445, -0.78554136, 1.02287...
Name: word2vec_doc, Length: 200, dtype: object


In [7]:
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression

model_logistic = LogisticRegression()

y_train = concat_train.generated
y_val = concat_val.generated

# flattening text representation column from lists into separate columns
X_train = concat_train['word2vec_doc'].apply(lambda x: pd.Series(x))
X_train.columns = X_train.columns.astype(str)
X_val = concat_val['word2vec_doc'].apply(lambda x: pd.Series(x))
X_val.columns = X_val.columns.astype(str)
        
model_logistic.fit(X_train, y_train)
        
y_pred = model_logistic.predict(X_train)
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.97      0.96       887
           1       0.96      0.96      0.96       887

    accuracy                           0.96      1774
   macro avg       0.96      0.96      0.96      1774
weighted avg       0.96      0.96      0.96      1774



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [8]:
y_pred = model_logistic.predict(X_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.93      0.93       100
           1       0.93      0.93      0.93       100

    accuracy                           0.93       200
   macro avg       0.93      0.93      0.93       200
weighted avg       0.93      0.93      0.93       200



In [9]:
df = pd.DataFrame(y_pred)
pred_val_df = pd.concat([text_validation, df], axis=1)
pred_val_df.columns = ["id", "text", "prediction"]
print(pred_val_df)

       id                                               text  prediction
0    1023  Millions of Seniors Begin Receiving Third Stim...           0
1     165  A History of Tension: Six Justices Absent from...           1
2     331  Hundreds of Thousands of Livestock Trapped and...           0
3    1705  Grambling State University's Tiger Marching Ba...           1
4    1361  Prosecution's Cross-Examination Strategy of Ky...           1
..    ...                                                ...         ...
195  1161  Debunking the Myth: No SEAL Rescue of Children...           1
196   160  Prince Harry and Meghan Markle Honor Martin Lu...           0
197   987  World's largest container ship 'EVER ACE' succ...           0
198  1944  Colin Powell, former general and secretary of ...           0
199  1094  WHO lists additional COVID-19 vaccine for emer...           0

[200 rows x 3 columns]


In [11]:
prediction = (
        pred_val_df.set_index("id")["prediction"]
    )

# converting the prediction to the required format
prediction.name = "generated"
prediction = prediction.reset_index()

# saving the prediction
output_directory = get_output_directory(str(Path("authorship-verification-jupyter.ipynb").parent))
prediction.to_json(
        Path(output_directory) / "predictions_w2v.jsonl", orient="records", lines=True
)
