In [1]:
import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy
from sklearn.model_selection import train_test_split




In [2]:
import pandas as pd

# Đọc trực tiếp file JSON thành DataFrame
df = pd.read_json('devset_images_metadata.json')

# Nếu cần chuyển đổi từ cột chứa danh sách các đối tượng thành các cột DataFrame
df = pd.json_normalize(df['images'])

# Hiển thị DataFrame
df.head()


Unnamed: 0,description,user_tags,title,license_name,user_nsid,image_extension_original,longitude,image_id,license_url,date_uploaded,date_taken,latitude,image_url,user_nickname,capture_device
0,,"[2009 road trip, obrero road trip]",Biltmore Estate,Attribution-NonCommercial-NoDerivs License,95156977@N00,jpg,,3519864665,http://creativecommons.org/licenses/by-nc-nd/2.0/,1242004112,2009-05-10 08:27:33.0,,http://www.flickr.com/photos/95156977@N00/3519...,5 Flip-Flops (Earl),Canon EOS DIGITAL REBEL XT
1,,"[daulatabad, daulatabad fort, ellora, road trip]",Chand Minar,Attribution-ShareAlike License,24574470@N00,jpg,75.200386,4896119055,http://creativecommons.org/licenses/by-sa/2.0/,1281931224,2010-08-14 13:35:10.0,19.939383,http://www.flickr.com/photos/24574470@N00/4896...,sankarshan,NIKON CORPORATION NIKON D90
2,"After the flood, the boarded up stores bear up...","[cedarrapids, createsouthroadtrip2009, disaste...",Uplifting Graffiti,Attribution License,73451168@N00,jpg,,3468473862,http://creativecommons.org/licenses/by/2.0/,1240493762,2009-04-21 18:07:56.0,,http://www.flickr.com/photos/73451168@N00/3468...,J Wynia,Panasonic DMC-TZ5
3,,"[cork, enchente, flood, ireland, irlanda]",DSCF6487,Attribution-NonCommercial-NoDerivs License,12947023@N00,jpg,-8.621177,4120853942,http://creativecommons.org/licenses/by-nc-nd/2.0/,1258754762,2009-11-20 15:16:40.0,51.889603,http://www.flickr.com/photos/12947023@N00/4120...,guileite,FUJIFILM FinePix S6000fd
4,,"[athens georgia, brown, current, flood, mud, r...",Oconoe river - flooded,Attribution License,60704492@N00,jpg,-83.368265,4436083254,http://creativecommons.org/licenses/by/2.0/,1268676971,2010-03-13 15:14:04.0,33.949149,http://www.flickr.com/photos/60704492@N00/4436...,The_Gut,Canon PowerShot SX10 IS


In [3]:
train_label = pd.read_csv('devset_images_gt.csv')
train_label.head()

Unnamed: 0,id,label
0,3519864665,0
1,4896119055,0
2,3468473862,0
3,4120853942,0
4,4436083254,0


In [4]:
train_label.rename(columns = {'id': 'image_id', 'label': 'train_y'}, inplace = True)
train_label.head()

Unnamed: 0,image_id,train_y
0,3519864665,0
1,4896119055,0
2,3468473862,0
3,4120853942,0
4,4436083254,0


In [5]:
data = pd.concat([df, train_label], axis = 1)
data.head()

Unnamed: 0,description,user_tags,title,license_name,user_nsid,image_extension_original,longitude,image_id,license_url,date_uploaded,date_taken,latitude,image_url,user_nickname,capture_device,image_id.1,train_y
0,,"[2009 road trip, obrero road trip]",Biltmore Estate,Attribution-NonCommercial-NoDerivs License,95156977@N00,jpg,,3519864665,http://creativecommons.org/licenses/by-nc-nd/2.0/,1242004112,2009-05-10 08:27:33.0,,http://www.flickr.com/photos/95156977@N00/3519...,5 Flip-Flops (Earl),Canon EOS DIGITAL REBEL XT,3519864665,0
1,,"[daulatabad, daulatabad fort, ellora, road trip]",Chand Minar,Attribution-ShareAlike License,24574470@N00,jpg,75.200386,4896119055,http://creativecommons.org/licenses/by-sa/2.0/,1281931224,2010-08-14 13:35:10.0,19.939383,http://www.flickr.com/photos/24574470@N00/4896...,sankarshan,NIKON CORPORATION NIKON D90,4896119055,0
2,"After the flood, the boarded up stores bear up...","[cedarrapids, createsouthroadtrip2009, disaste...",Uplifting Graffiti,Attribution License,73451168@N00,jpg,,3468473862,http://creativecommons.org/licenses/by/2.0/,1240493762,2009-04-21 18:07:56.0,,http://www.flickr.com/photos/73451168@N00/3468...,J Wynia,Panasonic DMC-TZ5,3468473862,0
3,,"[cork, enchente, flood, ireland, irlanda]",DSCF6487,Attribution-NonCommercial-NoDerivs License,12947023@N00,jpg,-8.621177,4120853942,http://creativecommons.org/licenses/by-nc-nd/2.0/,1258754762,2009-11-20 15:16:40.0,51.889603,http://www.flickr.com/photos/12947023@N00/4120...,guileite,FUJIFILM FinePix S6000fd,4120853942,0
4,,"[athens georgia, brown, current, flood, mud, r...",Oconoe river - flooded,Attribution License,60704492@N00,jpg,-83.368265,4436083254,http://creativecommons.org/licenses/by/2.0/,1268676971,2010-03-13 15:14:04.0,33.949149,http://www.flickr.com/photos/60704492@N00/4436...,The_Gut,Canon PowerShot SX10 IS,4436083254,0


In [6]:
def preprocess_user_tags(tags):
    if isinstance(tags, list):
        return ' '.join(tags)
    elif pd.isnull(tags):
        return '[NULL]'
    else:
        return tags

data['user_tags'] = data['user_tags'].apply(preprocess_user_tags)

In [7]:
data['text'] = data[['description', 'user_tags','title']].apply(lambda x: ' | '.join(x.dropna()), axis=1)
data['text']

0       2009 road trip obrero road trip | Biltmore Estate
1       daulatabad daulatabad fort ellora road trip | ...
2       After the flood, the boarded up stores bear up...
3          cork enchente flood ireland irlanda | DSCF6487
4       athens georgia brown current flood mud river s...
                              ...                        
5275    550d camino canon canoneos550d canoneoskissx4 ...
5276    albany, ny flood walk water | Albany's Corning...
5277                al the waters in pike road | IMG_4989
5278    2013 Fair Flood | 2013 county fair flood linn ...
5279    Alcatraz trip, San Francisco |  | Prison building
Name: text, Length: 5280, dtype: object

In [8]:
texts = data['text'].tolist()
labels = data['train_y'].tolist()  # Replace 'label_column_name' with the actual column name

In [9]:
train_text, val_text, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

In [10]:
from transformers import BertTokenizer, TFBertForSequenceClassification, create_optimizer

In [11]:
   # Adjust num_labels for your classification task
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = TFBertForSequenceClassification.from_pretrained(model_name, num_labels=2)




All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
def encode_texts(texts, tokenizer, max_length):
    return tokenizer(texts, max_length=max_length, truncation=True, padding='max_length', return_tensors='tf')

In [13]:
encoded_inputs = encode_texts(train_text, tokenizer, max_length=128)
dataset = tf.data.Dataset.from_tensor_slices((dict(encoded_inputs), train_labels)).shuffle(len(train_text)).batch(32)
val_inputs = encode_texts(val_text, tokenizer, max_length = 128)
val_dataset = tf.data.Dataset.from_tensor_slices((dict(val_inputs), val_labels)).batch(32)

In [14]:
optimizer = Adam(learning_rate=2e-5)
loss = SparseCategoricalCrossentropy(from_logits=True)
metric = SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

AttributeError: 'Variable' object has no attribute '_distribute_strategy'

In [None]:
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
checkpoint = tf.keras.callbacks.ModelCheckpoint('best_model.h5', monitor='val_accuracy', save_best_only=True)

In [None]:
history = model.fit(dataset, epochs= 10, validation_data= val_dataset)  # Adjust epochs as necessary

In [None]:
# Evaluate the model
loss, accuracy = model.evaluate(val_dataset)
print(f"Loss: {loss}, Accuracy: {accuracy}")

In [None]:
test_df = pd.read_csv('test.csv')
test_df

In [None]:
# Predict on new data
import pandas as pd
import tensorflow as tf

# Load the test data
test_df = pd.read_csv('test.csv')

# Preprocess the data (handle NaN values, encode text data, etc.)
# For example, fill NaN values with a placeholder '[NULL]'
test_df['text'] = test_df[['description', 'user_tags','title']].apply(lambda x: ' | '.join(x.dropna()), axis=1)



In [None]:
test_df['text'] = test_df['text'].apply(preprocess_user_tags)
test_texts = test_df['text'].to_list()
test_texts

In [None]:
encoded_inputs = encode_texts(test_texts, tokenizer, max_length=128)
encoded_inputs


In [None]:
# Get logits
predictions = model.predict(dict(encoded_inputs))
logits = predictions.logits

# For binary classification, convert logits to binary class labels using a threshold
predicted_labels = (logits[:, 1] > 0).astype(int)

print(predicted_labels)  # Output: array of predicted class indices (0 or 1)

In [None]:
submit = pd.DataFrame({'id': test_df['image_id'], 'label': predicted_labels})
submit.head()

In [None]:
results_csv_path = 'Bret.csv'
submit.to_csv(results_csv_path, index=False)