In [None]:
# Attach Google Drive to the Colab runtime so that files stored under
# /content/drive/MyDrive can be read and written by the cells below.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

In [None]:
import os

# Folder on the mounted Drive used as the working directory for the NLP
# project. Later cells invoke train.py and predict.py by relative path, so
# they must be run from here.
project_dir = '/content/drive/MyDrive/nlp-project/colab_files'

# Fail fast with an actionable message (instead of a bare chdir error) when
# Drive has not been mounted yet or the folder has been moved or renamed.
if not os.path.isdir(project_dir):
    raise FileNotFoundError(
        f"Project directory not found: {project_dir}. "
        "Mount Google Drive first and check that the folder exists."
    )

os.chdir(project_dir)

print(f"Working directory: {os.getcwd()}")

# Install required packages if needed (%pip targets the kernel's environment)
# %pip install transformers torch pandas tqdm scikit-learn

## Training Models

Train one model for each of the Turkish (tr) and Italian (it) languages.

In [None]:
# Train model for Turkish language
# Runs train.py (resolved relative to the current working directory) with
# --language_filter tr, the shared train/eval CSVs under public_data, and the
# shared models folder as --output_dir, using batch size 16 for 15 epochs.
# NOTE(review): presumably --language_filter restricts training to Turkish
# rows — confirm in train.py.
# NOTE(review): no --model_name is passed here, while the prediction cells pass
# --model_name xlm-roberta-large; confirm train.py's default matches.
!python train.py --language_filter tr \
  --train_file /content/drive/MyDrive/nlp-project/public_data/train.csv \
  --eval_file /content/drive/MyDrive/nlp-project/public_data/eval.csv \
  --output_dir /content/drive/MyDrive/nlp-project/models \
  --batch_size 16 \
  --num_epochs 15

In [None]:
# Train model for Italian language
# Runs train.py (resolved relative to the current working directory) with
# --language_filter it, the shared train/eval CSVs under public_data, and the
# shared models folder as --output_dir, using batch size 16 for 15 epochs.
# NOTE(review): presumably --language_filter restricts training to Italian
# rows — confirm in train.py.
# NOTE(review): no --model_name is passed here, while the prediction cells pass
# --model_name xlm-roberta-large; confirm train.py's default matches.
!python train.py --language_filter it \
  --train_file /content/drive/MyDrive/nlp-project/public_data/train.csv \
  --eval_file /content/drive/MyDrive/nlp-project/public_data/eval.csv \
  --output_dir /content/drive/MyDrive/nlp-project/models \
  --batch_size 16 \
  --num_epochs 15

## Predictions

Generate predictions for each language and combine them

In [None]:
import pandas as pd

# Create the directory for temporary predictions
!mkdir -p /content/drive/MyDrive/nlp-project/predictions/temp_tr

# Generate predictions for Turkish language
!python predict.py \
  --test_file /content/drive/MyDrive/nlp-project/starting_kit/test_w_o_labels.csv \
  --language_filter tr \
  --model_name xlm-roberta-large \
  --model_dir /content/drive/MyDrive/nlp-project/models \
  --batch_size 16 \
  --max_length 128 \
  --output_dir /content/drive/MyDrive/nlp-project/predictions/temp_tr

# Copy the predictions to a language-specific file
!cp /content/drive/MyDrive/nlp-project/predictions/temp_tr/prediction.csv /content/drive/MyDrive/nlp-project/predictions/tr_prediction.csv

# Load the Turkish predictions
tr_predictions = pd.read_csv('/content/drive/MyDrive/nlp-project/predictions/tr_prediction.csv')

In [None]:
# Create the directory for temporary predictions
!mkdir -p /content/drive/MyDrive/nlp-project/predictions/temp_it

# Generate predictions for Italian language
!python predict.py \
  --test_file /content/drive/MyDrive/nlp-project/starting_kit/test_w_o_labels.csv \
  --language_filter it \
  --model_name xlm-roberta-large \
  --model_dir /content/drive/MyDrive/nlp-project/models \
  --batch_size 16 \
  --max_length 128 \
  --output_dir /content/drive/MyDrive/nlp-project/predictions/temp_it

# Copy the predictions to a language-specific file
!cp /content/drive/MyDrive/nlp-project/predictions/temp_it/prediction.csv /content/drive/MyDrive/nlp-project/predictions/it_prediction.csv

# Load the Italian predictions
it_predictions = pd.read_csv('/content/drive/MyDrive/nlp-project/predictions/it_prediction.csv')

In [None]:
# Combine predictions from both languages
import pandas as pd

# Reload from the saved per-language files so this cell can be re-run on its
# own, independent of whatever is currently bound in the kernel.
tr_predictions = pd.read_csv('/content/drive/MyDrive/nlp-project/predictions/tr_prediction.csv')
it_predictions = pd.read_csv('/content/drive/MyDrive/nlp-project/predictions/it_prediction.csv')

# Stack both frames under a fresh 0..n-1 index; without ignore_index the two
# inputs would contribute overlapping index labels.
stacked_predictions = pd.concat([tr_predictions, it_predictions], ignore_index=True)

# Predictions should already be filtered by language, so an id appearing in
# both files points at a filtering problem. The first occurrence is kept
# (Turkish rows come first), but the overlap is reported instead of being
# discarded silently.
n_duplicate_ids = int(stacked_predictions.duplicated(subset=['id']).sum())
if n_duplicate_ids:
    print(f"Warning: dropped {n_duplicate_ids} row(s) with duplicate ids (kept first occurrence)")

# De-duplicate, order by id, and renumber the rows so the preview is readable.
combined_predictions = (
    stacked_predictions
    .drop_duplicates(subset=['id'])
    .sort_values('id')
    .reset_index(drop=True)
)

# Save combined predictions (index is not written, so file contents depend only on the rows)
combined_predictions.to_csv('/content/drive/MyDrive/nlp-project/predictions/combined_prediction.csv', index=False)

# View the first few rows of combined predictions
combined_predictions.head()