In [1]:
import pandas as pd
import numpy as np
# Seed NumPy's global random state so that steps drawing from it repeat
# across runs when the cells are executed in the same order.
np.random.seed(43)

In [2]:
# Load the competition training data from the Kaggle input directory.
data = pd.read_csv("/kaggle/input/h2oai-predict-the-llm/train.csv")

In [3]:
# Preview the first rows of the training data.
data.head()

Unnamed: 0,Question,Response,target
0,Explain the concept of coevolution.,Coevolution is a biological process that occur...,3
1,Is it possible that recurring fever and chills...,"Yes, recurring fever and chills can be a sympt...",4
2,Evaluate the expression 3!,The expression 3! represents the factorial of ...,1
3,What are the roles of different types of RNA i...,1. Messenger RNA (mRNA): mRNA carries genetic ...,3
4,What is the role of gene flow in population ge...,Gene flow refers to the movement of individual...,3


In [4]:
# Column dtypes and non-null counts; used here to spot missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3976 entries, 0 to 3975
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Question  3976 non-null   object
 1   Response  3969 non-null   object
 2   target    3976 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 93.3+ KB


In [5]:
# Drop every row that contains a missing value, mutating `data` in place.
# The info() output above shows 7 rows with a null Response (3976 -> 3969).
# The original row labels are kept; the index is not reset.
data.dropna(inplace=True)

In [6]:
# Confirm the row/column count after dropping rows with missing values.
data.shape

(3969, 3)

In [8]:
def spacy_tokenizer(sentence):
    """Lemmatise, lowercase and filter a text using the notebook's spaCy pipeline.

    The notebook-level names ``nlp``, ``stop_words`` and ``punctuations`` are
    looked up when the function is *called*, so the cell that defines them
    must have been executed before the first call.

    Parameters
    ----------
    sentence : str or object
        Text to normalise. Any non-string value (for example the float NaN
        pandas uses for a missing cell, or ``None``) is returned unchanged so
        that callers can still detect missing entries with ``isnull()``.

    Returns
    -------
    str or object
        For string input: the lowercased, stripped lemmas of every token,
        joined by single spaces, with stop words and punctuation removed.
        For non-string input: the input object itself.
    """
    # Only real strings can be handed to spaCy; pass everything else through
    # instead of special-casing just the built-in float type.
    if not isinstance(sentence, str):
        return sentence

    lemmas = (token.lemma_.lower().strip() for token in nlp(sentence))
    # `punctuations` is a str, so `in` is a substring test: single punctuation
    # marks are dropped, and so are lemmas that became empty after strip().
    kept = [
        lemma
        for lemma in lemmas
        if lemma not in stop_words and lemma not in punctuations
    ]
    return " ".join(kept)

In [7]:
import spacy
import string
# Globals consumed by spacy_tokenizer. That function is defined in the cell
# above but only looks these names up at call time, so this cell has to run
# before the tokenizer is first applied.
# Small English spaCy pipeline used to tokenise and lemmatise the text.
nlp = spacy.load("en_core_web_sm")
# Stop-word collection taken from the pipeline's language defaults.
stop_words = nlp.Defaults.stop_words
# string.punctuation is a single str of ASCII punctuation characters, so
# `token in punctuations` is a substring test rather than a set lookup.
punctuations = string.punctuation

In [None]:
# Add a normalised companion column for each raw text column; the raw
# Question/Response columns are kept alongside them.
data['tokenized_Question'] = data['Question'].apply(spacy_tokenizer)
data['tokenized_Response'] = data['Response'].apply(spacy_tokenizer)

In [None]:
# Check that the two tokenised columns were added.
data.head()

In [None]:
# Features: the two normalised text columns, in the order the vectorizer cell
# expects (question first, response second). Label: the integer class in
# `target`. Selecting by name instead of by position keeps this correct even
# if the column layout of `data` changes (extra index column, re-run cells).
x = data[['tokenized_Question', 'tokenized_Response']]
y = data['target']

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Two independent bag-of-words vocabularies: vec1 for the first column of x,
# vec2 for the second.
vec1, vec2 = CountVectorizer(), CountVectorizer()

# Learn each vocabulary and build the matching sparse count matrix in one step.
X_vec1, X_vec2 = vec1.fit_transform(x.iloc[:, 0]), vec2.fit_transform(x.iloc[:, 1])

In [None]:
from scipy.sparse import hstack
# Concatenate the two count matrices column-wise: the vec1 features first,
# then the vec2 features. The test-time matrix built later uses the same order.
X_vec = hstack((X_vec1, X_vec2))

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation, preserving the class proportions
# of y in both parts. An explicit random_state pins the split so that
# re-running this cell (or running cells out of order) cannot reshuffle rows
# between the train and test parts behind an already fitted model.
x_train, x_test, y_train, y_test = train_test_split(
    X_vec, y, test_size=0.2, stratify=y, random_state=43
)

In [None]:
from xgboost import XGBClassifier

In [None]:
# XGBoost classifier with n_estimators set to 250; every other setting is
# left at the library default.
cla = XGBClassifier(n_estimators=250)

In [None]:
# Fit on the training portion of the combined count features.
cla.fit(x_train,y_train)

In [None]:
# Hard class predictions for the held-out rows.
y_pred = cla.predict(x_test)

In [None]:
from sklearn.metrics import classification_report
# classification_report expects (y_true, y_pred). With the arguments swapped,
# the precision and recall columns trade places and the support column counts
# predicted labels instead of true labels.
print(classification_report(y_test, y_pred))

In [None]:
# Load the competition test set and preview its first rows.
test_df = pd.read_csv('/kaggle/input/h2oai-predict-the-llm/test.csv')
test_df.head()

In [None]:
# Normalise the test text with the same tokenizer that was applied to the
# training data, so the fitted vectorizers see comparably processed text.
test_df['tokenized_Question'] = test_df['Question'].apply(spacy_tokenizer)
test_df['tokenized_Response'] = test_df['Response'].apply(spacy_tokenizer)

In [None]:
# Rows whose tokenised question is missing (spacy_tokenizer returns float
# inputs such as NaN unchanged, so missing cells stay missing).
test_df.loc[test_df['tokenized_Question'].isnull()]

In [None]:
# Rows whose tokenised response is missing.
test_df.loc[test_df['tokenized_Response'].isnull()]

In [None]:
# Replace missing tokenised text with the placeholder 'no' so the vectorizers
# only ever receive strings. Filling by column name and null mask removes the
# hard-coded row/column positions, works for any test file, and leaves text
# that is present in the other column of the same row untouched.
test_df = test_df.fillna({'tokenized_Question': 'no', 'tokenized_Response': 'no'})

In [None]:
# Reuse the vocabularies learned on the training text (transform, not
# fit_transform) so the test columns line up with the training columns.
test_vec1 = vec1.transform(test_df['tokenized_Question'])
test_vec2 = vec2.transform(test_df['tokenized_Response'])

from scipy.sparse import hstack
# Same column order as X_vec. Request CSR explicitly: the submission cell
# indexes rows (test_vec[i]), and depending on the SciPy version the default
# hstack result can be a COO matrix, which does not support indexing.
test_vec = hstack((test_vec1, test_vec2), format='csr')

In [None]:
submission_df = pd.read_csv('/kaggle/input/h2oai-predict-the-llm/sample_submission.csv')

# Score the whole test matrix in one call instead of one predict_proba call
# per row. tocsr() is a no-op for a CSR matrix and makes a COO result from
# hstack acceptable as well.
test_proba = cla.predict_proba(test_vec.tocsr())

# One probability column per class, one row per submission row; fail loudly
# if the model output does not line up with the submission template.
assert test_proba.shape == (len(submission_df), 7), (
    f"expected {(len(submission_df), 7)} probabilities, got {test_proba.shape}"
)

# Columns 1..7 of the template receive the probabilities of classes 0..6,
# matching the positions the per-row assignments used.
submission_df.iloc[:, 1:8] = test_proba

In [None]:
# Preview the filled-in submission table.
submission_df.head()

In [None]:
# Write the submission file without the DataFrame index column.
submission_df.to_csv('submission.csv',index=False)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features with max_features capped at 10000.
tfidf_vectorizer = TfidfVectorizer(max_features=10000)

# Each row's question and response tokens are joined into one document
# (separated by a space) before vectorising.
# NOTE(review): X_tfidf is not referenced by any later cell shown in this
# notebook — confirm whether it was meant to replace X_vec for training.
X_tfidf = tfidf_vectorizer.fit_transform(data['tokenized_Question'] + ' ' + data['tokenized_Response'])


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Count features over unigrams and bigrams (ngram_range=(1, 2)) of the joined
# question + response text.
ngram_vectorizer = CountVectorizer(ngram_range=(1, 2))  # Let's put a small range first
# NOTE(review): X_ngrams is not referenced by any later cell shown in this
# notebook — confirm whether it was meant to feed the model.
X_ngrams = ngram_vectorizer.fit_transform(data['tokenized_Question'] + ' ' + data['tokenized_Response'])


In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Candidate hyperparameters: 4 x 3 x 3 = 36 combinations.
param_grid = {
    'n_estimators': [100, 200, 250, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5]
}

# We create an XGBoost classifier
xgb_classifier = XGBClassifier()

# Perform grid search with 5-fold cross-validation, ranking candidates by
# macro-averaged F1.
grid_search = GridSearchCV(xgb_classifier, param_grid, cv=5, scoring='f1_macro')
grid_search.fit(x_train, y_train)
print("Best Parameters:", grid_search.best_params_)

# GridSearchCV refits the best configuration on all of x_train by default
# (refit=True), so reuse that fitted estimator instead of constructing and
# training an identical model a second time.
best_xgb_classifier = grid_search.best_estimator_

# Make predictions and evaluate the model on the held-out rows
y_pred = best_xgb_classifier.predict(x_test)
print(classification_report(y_test, y_pred))


In [None]:
# NOTE(review): submission_df has not been modified since it was filled from
# `cla` and written above, so this rewrites the same predictions; the tuned
# best_xgb_classifier is never used for the submission — confirm intent.
submission_df.to_csv('submission.csv',index=False)