In [2]:
from pathlib import Path

from joblib import dump
from tira.rest_api_client import Client
import pandas as pd
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)
from sklearn.feature_extraction.text import CountVectorizer
import re
import numpy as np

# Load the data
tira = Client()
text = tira.pd.inputs(
    "nlpbuw-fsu-sose-24", "language-identification-train-20240429-training"
)
#text = text.set_index("id")
labels = tira.pd.truths(
    "nlpbuw-fsu-sose-24", "language-identification-train-20240429-training"
)

text = text.set_index("id")
df = text.join(labels.set_index("id"))

def get_block(*ranges):
    block = []
    for r in ranges:
        r = r.split('-')
        block += list(range(int(r[0], 16), int(r[1], 16) + 1))
    return block

def comp_freq(text, block):
    encoded = np.array([ord(c) for c in text])
    return np.sum(np.isin(encoded, block)) / len(encoded)

freq_vec = np.vectorize(comp_freq, excluded={1})

def is_latin(texts):
    latin_block = get_block('0041-024F')
    freqs = freq_vec(texts, latin_block)
    return freqs > 0.5

pred_latin = is_latin(df["text"])
               
# Classify all latin-languages
text_val_latin_only = df[pred_latin]["text"]

remove_punctuation = str.maketrans('', '', r"-()\"#/@;:<>{}-=~|.?,")

def PunctFreeLower(texts):
    cleaned = []
    for text in texts:
        cleaned.append((re.sub(r"[0-9]+", "", (text.translate(remove_punctuation)))).lower())
    return cleaned

text_val_cleaned = PunctFreeLower(text_val_latin_only["text"])

vec2 = CountVectorizer(analyzer='char', ngram_range=(1, 3))
text_val_vec2 = vec2.transform(text_val_cleaned)
   
# Load the model and make predictions
model = open(Path().resolve() / "model.joblib")
predictions = model.predict(text_val_vec2)
df_pred = pd.DataFrame (predictions)

latin_predict = pd.concat([df[pred_latin], df_pred])
#df = df[["id", "language"]]

# Save the predictions
latin_predict.to_json(
    Path().resolve() / "predictions.jsonl", orient="records", lines=True
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[pred_latin]["language"] = "de" # predictions


KeyError: "None of [Index(['id', 'language'], dtype='object')] are in the [columns]"

In [1]:
from pathlib import Path

from joblib import dump
from tira.rest_api_client import Client
import pandas as pd
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)
from sklearn.feature_extraction.text import CountVectorizer
import re
import numpy as np

In [2]:
# Load the data
tira = Client()
text = tira.pd.inputs(
    "nlpbuw-fsu-sose-24", "language-identification-train-20240429-training"
)
#text = text.set_index("id")
labels = tira.pd.truths(
    "nlpbuw-fsu-sose-24", "language-identification-train-20240429-training"
)

text = text.set_index("id")
df = text.join(labels.set_index("id"))
df["pred_lang"] = pd.Series([""] * 320000, index=df.index)

In [15]:
# Split texts in latin and non-latin languages
def get_block(*ranges):
    block = []
    for r in ranges:
        r = r.split('-')
        block += list(range(int(r[0], 16), int(r[1], 16) + 1))
    return block

def comp_freq(text, block):
    encoded = np.array([ord(c) for c in text])
    return np.sum(np.isin(encoded, block)) / len(encoded)

freq_vec = np.vectorize(comp_freq, excluded={1})

def is_latin(texts):
    latin_block = get_block('0041-024F')
    freqs = freq_vec(texts, latin_block)
    return freqs > 0.5

pred_latin = is_latin(df["text"])

# Classify all non-latin-languages
text_val_non_latin = df.loc[~pred_latin, ('text')]

def is_cyrillic(texts):
    cyrillic_block = get_block('0400-04FF', '0500-052F')
    freqs = freq_vec(texts, cyrillic_block)
    return freqs > 0.5

pred_cyrillic = is_cyrillic(text_val_non_latin)

# TODO: Train NaiveBayes to distinguish russian from bulgarian
cyrillic = ~pred_latin.copy()
cyrillic[~pred_latin] = pred_cyrillic    # creating a boolean dataframe of full size, where only cyrillic languages are set to true

df.loc[cyrillic, ('pred_lang')] = "ru"

In [18]:
df
cyrillic.shape
df.loc[cyrillic].shape

(31880, 3)

In [19]:
non_latin_blocks = {'el': '0370-03FF', 'zh': '4E00-9FFF', 'ko': 'AC00-D7AF', 'ur': '0600-06FF'}

def classify_remainders(texts):
    langs = np.array(list(non_latin_blocks.keys()))
    freqs = np.empty(shape=(texts.shape[0], len(langs)))
    for i, lang in enumerate(langs):
        block = get_block(non_latin_blocks[lang])
        freqs[:, i] = freq_vec(texts, block)
    preds = langs[np.argmax(freqs, axis=1)]
    return preds

remainders = ~pred_latin.copy()
remainders[~pred_latin] = ~pred_cyrillic    # creating a boolean dataframe of full size, where only non-latin, non-cyrillic languages are set to true
text_val_remainders = df.loc[remainders, ('text')] # all non-latin, non-cyrillic texts
lang_remainders = classify_remainders(text_val_remainders)
df.loc[remainders, ('pred_lang')] = lang_remainders

(320000,) (64017, 3)
(64017,)


ValueError: Length of values (64017) does not match length of index (320000)

In [20]:
df.loc[remainders, ('pred_lang')] = lang_remainders

In [21]:
df

Unnamed: 0_level_0,text,lang,pred_lang
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Der Flughafen Berlin Brandenburg verfügt über ...,de,
2,"Успешное развитие общества, однако, возможно л...",ru,ru
3,I øvrigt er kendetegnene for en magnetisk svag...,da,
4,Sowohl über den historischen Simon als auch üb...,de,
5,"Emmure е формирана през 2003 г., когато Франки...",bg,ru
...,...,...,...
399994,Sociální náklady stejně jako soukromé náklady ...,cs,
399995,"Sljedećeg dana, glumac je malo zakasnio na set...",hr,
399996,"İstiqlal Sarayı (), Yenidən Birleşme Sarayı ()...",az,
399998,"Nella serie ""Magico Vento"" figura il personagg...",it,


In [22]:
# Classify all latin-languages
text_val_latin_only = df.loc[pred_latin, ('text')]

remove_punctuation = str.maketrans('', '', r"-()\"#/@;:<>{}-=~|.?,")

def PunctFreeLower(texts):
    cleaned = []
    for text in texts:
        cleaned.append((re.sub(r"[0-9]+", "", (text.translate(remove_punctuation)))).lower())
    return cleaned

text_val_cleaned = PunctFreeLower(text_val_latin_only)

vec2 = CountVectorizer(analyzer='char', ngram_range=(1, 3))
text_val_vec2 = vec2.fit_transform(text_val_cleaned)
   
# Load the model and make predictions
model = open(Path().resolve() / "model.joblib")
predictions = model.predict(text_val_vec2)
df.loc[pred_latin, ('pred_lang')] = predictions
df_ = df.loc[:, ('pred_lang')]

# Save the predictions
df_.to_json(
    Path().resolve() / "predictions.jsonl", orient="index", index=True
)
# df_.to_json(
#     Path().resolve() / "predictions.jsonl", orient="records", lines=True
# )

AttributeError: '_io.TextIOWrapper' object has no attribute 'predict'

: 

In [4]:
model = open(Path().resolve() / "model.joblib")
model

<_io.TextIOWrapper name='/workspaces/nlpbuw-fsu-sose-24-team-tapestry/language-identification-bayes/model.joblib' mode='r' encoding='UTF-8'>