In [1]:
!pip install datasets
!pip install tqdm
!pip install spacy
!pip install matplotlib
!pip install scikit-learn
!pip install sentence_transformers
!pip install xgboost



Collecting sentence_transformers
  Downloading sentence_transformers-3.4.1-py3-none-any.whl.metadata (10 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence_transformers)
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
Collecting torch>=1.11.0 (from sentence_transformers)
  Downloading torch-2.6.0-cp312-cp312-manylinux1_x86_64.whl.metadata (28 kB)
Collecting networkx (from torch>=1.11.0->sentence_transformers)
  Downloading networkx-3.4.2-py3-none-any.whl.metadata (6.3 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cuda_cupti_

Downloading sentence_transformers-3.4.1-py3-none-any.whl (275 kB)
Downloading torch-2.6.0-cp312-cp312-manylinux1_x86_64.whl (766.6 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m766.6/766.6 MB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:02[0m
[?25hDownloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl (363.4 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hDownloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (13.8 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m[36m0:00:01[0mm eta [36m0:00:01[0m
[?25hDownloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (24.6 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m

In [1]:
from tqdm import tqdm
from datasets import load_dataset
import pandas as pd
from matplotlib import pyplot as plt
import spacy
import re
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
!spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [3]:
def get_questions(df):
    questions = []
    for row in tqdm(df.iterrows(),desc='[clean] getting questions',total=len(df)):
        text = row[1]['title']
        if row[1]['selftext'] != '' or row[1]['selftext'] is not None or row[1]['selftext'] != '[deleted]' or row[1]['selftext'] != '[removed]':
            text += ' ' + row[1]['selftext']
        questions.append(text)
    df['question'] = questions
    df = df.drop(columns=['title','selftext'])
    return df

def contains_media(text):
    """
    Checks if a given text contains any media URLs or a gif image command.

    :param text: The text to check.
    :type text: str

    :return: True if the text contains media, False otherwise.
    :rtype: bool
    """
    if re.search(r'(https?:\/\/.*\.(png|jpg|jpeg|gif|mp4|webm|avi|mov|wmv))', text):
        return True
    if re.search(r'(![gif]*)',text):
        return True
    return False

def remove_whitespace(text):
    """
    Removes all whitespace from a given string.

    :param text: The string from which to remove whitespace.
    :type text: str

    :return: The string with all whitespace removed.
    :rtype: str
    """
    return re.sub(r'[^\S ]+', '', text)

def remove_emojis(text):
    """
    Removes emojis and certain symbols from a given text string.

    This function uses a regex pattern to identify and remove a variety of emoji
    ranges and additional symbols from the input text.

    :param text: The string from which to remove emojis and symbols.
    :type text: str

    :return: The string with emojis and symbols removed.
    :rtype: str
    """
    emoji_pattern = re.compile(
        "[\U0001F600-\U0001F64F"  # Emoticons
        "\U0001F300-\U0001F5FF"  # Symbols & Pictographs
        "\U0001F680-\U0001F6FF"  # Transport & Map Symbols
        "\U0001F700-\U0001F77F"  # Alchemical Symbols
        "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
        "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
        "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        "\U0001FA00-\U0001FA6F"  # Chess Symbols
        "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        "\U00002700-\U000027BF"  # Dingbats
        "\U0001F1E6-\U0001F1FF"  # Flags (iOS)
        "\U00002500-\U00002BEF"  # Various Symbols
        "\U000024C2-\U0001F251"  # Enclosed characters
        "]+",
        flags=re.UNICODE
    )
    return emoji_pattern.sub(r'', text)

def get_qas(df):
    tokenized_questions = []
    tokenized_answers = []
    questions = []
    answers = []
    scores = []
    for row in tqdm(df.iterrows(),desc='[clean] getting qas',total=len(df)):
        if contains_media(row[1]['question']):
            continue

        question = remove_whitespace(row[1]['question'])
        question = remove_emojis(question)
        question = question.lower()

        if question == '' or question is None:
            continue

        doc_q = nlp(question)

        for i in range(len(row[1]['answers']['score'])):
            if contains_media(row[1]['answers']['text'][i]):
                continue

            answer = remove_whitespace(row[1]['answers']['text'][i])
            answer = remove_emojis(answer)
            answer = answer.lower()

            if answer == '' or answer is None:
                continue

            questions.append(question)
            answers.append(answer)
            scores.append(row[1]['answers']['score'][i])

            doc_a = nlp(answer)

            tokenized_questions.append([token.text for token in doc_q if not token.is_punct and not token.is_stop])
            tokenized_answers.append([token.text for token in doc_a if not token.is_punct and not token.is_stop])

    trios = pd.DataFrame({'question':questions,'answer':answers,'score':scores,'tokenized_question':tokenized_questions,'tokenized_answer':tokenized_answers})
    return trios

def normalize_scores(df,leave_min_max_out=True):
    """
    Normalizes scores based on the highest scored answer per question.

        Args:
            df (DataFrame): The given DataFrame.
            leave_min_max_out (bool): Boolean switch for the max_score column.

        Returns:
            DataFrame: the normalized DataFrame.
    """
    # max_score and min_score by question
    df['max_score'] = df.groupby('question')['score'].transform('max')

    # normalize, offset by min to guarantee the value being between 0 and 1
    df['normalized_score'] = df['score'] / df['max_score']

    if leave_min_max_out:
        df = df.drop(columns=['max_score','score'])
        df.rename(columns={'normalized_score':'score'},inplace=True)

    return df

def after_clean(df):
    new_token_answers = []
    new_token_questions = []
    for row in tqdm(df.iterrows(), desc='Cleaning some stuff up', total=len(df)):
        if type(row[1]['tokenized_answer']) is str:
            tmp_a = [remove_whitespace(remove_emojis(token[:-1])) for token in row[1]['tokenized_answer'].split(' ') if remove_whitespace(remove_emojis(token[:-1])) != '']
        else:
            tmp_a = [remove_whitespace(remove_emojis(str(token))) for token in list(row[1]['tokenized_answer']) if remove_whitespace(remove_emojis(str(token))) != '']
        if type(row[1]['tokenized_question']) is str:
            tmp_q = [remove_whitespace(remove_emojis(token[:-1])) for token in row[1]['tokenized_question'].split(' ') if remove_whitespace(remove_emojis(token[:-1])) != '']
        else:
            tmp_q = [remove_whitespace(remove_emojis(str(token))) for token in list(row[1]['tokenized_question']) if remove_whitespace(remove_emojis(str(token))) != '']
        if tmp_a[0][0] == '[':
            tmp_a[0] = tmp_a[0][1:]
        if tmp_q[0][0] == '[':
            tmp_q[0] = tmp_q[0][1:]
        new_token_answers.append(tmp_a)
        new_token_questions.append(tmp_q)
    df['tokenized_answer'] = new_token_answers
    df['tokenized_question'] = new_token_questions
    return df

def get_common_words(df):
    # may need some better solution
    no_no_list = [',','.','?','!',':','*','"','(',')','[',']','{','}','-','','“','”','\\','/','\'','\\n','\\n\\n','<','>','_','^','˘','~','|','&','…']
    new_common_words = []
    jaccard_indices = []

    for row in tqdm(df.iterrows(), desc='Getting common words between questions and answers', total=len(df)):
        tok_a = row[1]['tokenized_answer']
        tok_q = row[1]['tokenized_question']
        c = Counter()
        for token in tok_a:
            c[token] += 1
        for token in tok_q:
            c[token] += 1
        common_words = [token.strip() for token in c if c[token] > 1 and token not in no_no_list and len(token) != 0]
        new_common_words.append(common_words)
        # metszet / a + b - metszet (összes)
        jaccard_indices.append(len(common_words)/(len(tok_a)+len(tok_q)-len(common_words)))
    df['common_words'] = new_common_words
    df['jaccard_index'] = jaccard_indices
    return df

def calculate_cosine_similarity(df):
    """
    Calculates the cosine similarity between a question and its corresponding answer.

    This function uses scikit-learn's TfidfVectorizer to calculate the cosine similarity between each question and its
    corresponding answer. The resulting cosine similarities are then added as a column to the given DataFrame.

    :param df: The DataFrame containing the questions and answers for which to calculate the cosine similarity.
    :type df: pandas.DataFrame

    :return: The DataFrame with the added cosine similarity column.
    :rtype: pandas.DataFrame
    """
    df['combined_text'] = df['question'] + ' ' + df['answer']

    vectorizer = TfidfVectorizer()

    tfidf_question = vectorizer.fit_transform(df['question'])
    tfidf_answer = vectorizer.transform(df['answer'])

    cosine_similarities = [cosine_similarity(tfidf_question[i], tfidf_answer[i])[0][0] for i in tqdm(range(len(df)), desc='Calculating cosine similarity for each question - answer pair', total=len(df))]

    df['cosine_similarity'] = cosine_similarities
    df = df.drop(columns=['combined_text'])

    return df

In [4]:
ds = load_dataset("yarathealmighty/ask_science_qas")

train = ds['train'].to_pandas()
train = after_clean(train)
train = get_common_words(train)
train = calculate_cosine_similarity(train)

Cleaning some stuff up: 100%|█████████████████████████████████████████████████| 143110/143110 [00:41<00:00, 3489.91it/s]
Getting common words between questions and answers: 100%|████████████████████| 143110/143110 [00:09<00:00, 15082.33it/s]
Calculating cosine similarity for each question - answer pair: 100%|██████████| 143110/143110 [01:05<00:00, 2199.64it/s]


In [5]:
#-------------------------------------<params>-------------------------------------
TEST_SIZE = 0.10
ESTIMATORS = 251
LEARNING_RATE = 1e-1
MAX_DEPTH = 4
VECTORIZER = 'transformer'

ERROR_THRESHOLD = 0.2

In [6]:
from sentence_transformers import SentenceTransformer
vectorizer = TfidfVectorizer(max_features=100)
s_trans = SentenceTransformer('sentence-transformers/LaBSE')
def semantical_vectorziation(df):
    print('Vectorizing question, answers, and common_words...')

    questions_vectorized = s_trans.encode(df['q_t'])
    print('[sem_trans] questions encoded')
    answers_vectorized = s_trans.encode(df['a_t'])
    print('[sem_trans] answers encoded')
    common_words_vectorized = s_trans.encode(df['cw_t'])
    print('[sem_trans] common words encoded')

    df['q_v'] = list(questions_vectorized)
    df['a_v'] = list(answers_vectorized)
    df['cw_v'] = list(common_words_vectorized)

    # magic number 768, length of the vectors
    q_v_df = pd.DataFrame(df['q_v'].tolist(), columns=[f'q_v_{i}' for i in range(768)])
    a_v_df = pd.DataFrame(df['a_v'].tolist(), columns=[f'a_v_{i}' for i in range(768)])
    cw_v_df = pd.DataFrame(df['cw_v'].tolist(), columns=[f'cw_v_{i}' for i in range(768)])

    df_expanded = pd.concat([df.drop(['q_v', 'a_v', 'cw_v'], axis=1), q_v_df, a_v_df, cw_v_df], axis=1)

    print('[sem_trans] vectorization done')
    
    return df_expanded

def pre_vec_trans(df):
    q_texts = []
    a_texts = [] 
    
    cw_texts = []
    for row in tqdm(df.iterrows(),desc='Preparing questions and answers for vectorization',total=len(df)):
        q_t = " ".join(row[1]['tokenized_question'])
        a_t = " ".join(row[1]['tokenized_answer'])
        cw_t = " ".join(row[1]['common_words'])
        q_texts.append(q_t)
        a_texts.append(a_t)
        cw_texts.append(cw_t)
    df['q_t'] = q_texts
    df['a_t'] = a_texts
    df['cw_t'] = cw_texts
    return df

In [7]:
df = train

In [8]:
if VECTORIZER == 'tfidf':
  # prepare X
  X = pre_vec(df)
  df = df.drop(columns=['pre_vec_text'])

  # vectorize
  texts_tfidf = vectorizer.fit_transform(X['pre_vec_text'])

  tfidf_df = pd.DataFrame(texts_tfidf.toarray(), columns=vectorizer.get_feature_names_out())

  X = pd.concat([X, tfidf_df], axis=1)
  X = X.drop(columns=['question','answer','tokenized_question','tokenized_answer','common_words','pre_vec_text','score'])

elif VECTORIZER == 'transformer':
  # 40 mins
  X = pre_vec_trans(df)
  df = df.drop(columns=['q_t','a_t','cw_t'])

  X = semantical_vectorziation(X)

  X = X.drop(columns=['question','answer','tokenized_question','tokenized_answer','common_words','score','q_t','a_t','cw_t'])

Preparing questions and answers for vectorization: 100%|█████████████████████| 143110/143110 [00:05<00:00, 28227.16it/s]


Vectorizing question, answers, and common_words...
[sem_trans] questions encoded
[sem_trans] answers encoded
[sem_trans] common words encoded
[sem_trans] vectorization done


In [9]:
models = []

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

In [11]:
y = df.score.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=420)

dummy_regressor = DummyRegressor(strategy="mean")

dummy_regressor.fit(X_train, y_train)

y_pred_dummy = dummy_regressor.predict(X_test)

mse_dummy = mean_squared_error(y_test, y_pred_dummy)
r2_dummy = r2_score(y_test, y_pred_dummy)

models.append(("Dummy Regressor", mse_dummy, r2_dummy))

model = dummy_regressor

In [12]:
display(X)

Unnamed: 0,jaccard_index,cosine_similarity,q_v_0,q_v_1,q_v_2,q_v_3,q_v_4,q_v_5,q_v_6,q_v_7,...,cw_v_758,cw_v_759,cw_v_760,cw_v_761,cw_v_762,cw_v_763,cw_v_764,cw_v_765,cw_v_766,cw_v_767
0,0.236842,0.425906,0.003726,0.014490,-0.020968,0.030967,-0.007061,0.015681,-0.007131,-0.006450,...,-0.016766,-0.034731,-0.040735,-0.039403,-0.013082,-0.013687,-0.003488,0.055771,-0.030497,-0.012886
1,0.000000,0.023450,0.003726,0.014490,-0.020968,0.030967,-0.007061,0.015681,-0.007131,-0.006450,...,-0.044307,-0.027909,-0.017733,0.026550,0.047359,0.013292,-0.009682,0.000672,0.002206,-0.052806
2,0.118056,0.164632,-0.047229,0.005434,-0.012084,-0.048676,0.006344,0.001259,0.001723,-0.011140,...,0.035083,-0.033940,-0.040902,-0.030122,-0.061132,0.066942,0.037193,0.034166,-0.000494,0.009883
3,0.190476,0.195354,0.000096,0.020942,-0.023982,0.022068,-0.018930,0.007674,-0.010145,-0.057581,...,0.000485,-0.046151,-0.066522,0.019438,0.010513,0.024222,-0.002892,0.029278,-0.030822,-0.037815
4,0.115385,0.226430,0.010969,-0.051739,-0.028209,-0.038502,-0.065362,-0.005132,-0.016302,-0.030074,...,0.043727,0.026176,0.024221,-0.059350,-0.026365,0.021729,0.016086,0.045490,-0.007015,0.066982
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143105,0.210526,0.015767,0.010093,-0.042362,-0.076562,-0.038353,-0.035103,0.004246,-0.061104,-0.020287,...,-0.014227,0.022203,-0.055968,-0.018367,-0.028072,-0.004942,-0.026958,0.054828,-0.024327,0.016394
143106,0.243902,0.138538,0.010093,-0.042362,-0.076562,-0.038353,-0.035103,0.004246,-0.061104,-0.020287,...,-0.058418,0.012413,-0.065294,-0.031392,-0.020008,0.032010,-0.004304,0.067199,-0.020683,0.007785
143107,0.145833,0.206528,0.043645,-0.001426,-0.057499,-0.023564,-0.019966,-0.008666,-0.017432,-0.028549,...,0.036254,-0.064746,-0.014138,-0.072738,-0.027966,-0.010257,-0.022825,0.034210,0.012886,0.010597
143108,0.132353,0.361925,0.043645,-0.001426,-0.057499,-0.023564,-0.019966,-0.008666,-0.017432,-0.028549,...,-0.001021,-0.036468,-0.034058,-0.069250,-0.047556,0.015701,-0.030786,0.051039,0.015898,0.017438


In [13]:
lg_params = []

In [1]:
import lightgbm as lgb
import optuna
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Split train/test data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)

# Objective function for Optuna
def objective(trial):
    params = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        'num_leaves': trial.suggest_int('num_leaves', 20, 2000),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1,log=True),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1.0,log=True),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1.0,log=True),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
        'device': 'gpu'
    }

    model = lgb.train(params, train_data, num_boost_round=100, valid_sets=[valid_data])
    
    y_pred = model.predict(X_valid, num_iteration=model.best_iteration)
    rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
    return rmse

# Run optimization
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)  # Run 50 trials

# Best parameters
print("Best Parameters:", study.best_params)


NameError: name 'X' is not defined

In [None]:
lg_params_df = pd.DataFrame(lg_params)
lg_params_df.columns = ['mse','rmse','leaves','learning_rate','feature_fraction','bagging_fraction','bagging_freq','verbose','num_round']
display(lg_params_df)

In [42]:
linear_regressor = LinearRegression()

linear_regressor.fit(X_train, y_train)

y_pred_linear = linear_regressor.predict(X_test)

mse_linear = mean_squared_error(y_test, y_pred_linear)
r2_linear = r2_score(y_test, y_pred_linear)

models.append(("Linear Regression", mse_linear, r2_linear))

if mse_linear < mse_dummy:
  model = linear_regressor

In [17]:
xgb_reg = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=ESTIMATORS, learning_rate=LEARNING_RATE, max_depth=MAX_DEPTH, random_state=69)

xgb_reg.fit(X_train, y_train)

# pred
y_pred = xgb_reg.predict(X_test)
mse_xgb_reg = mean_squared_error(y_test, y_pred)
r2_xgb_reg = r2_score(y_test, y_pred)

models.append(("XGBoost_sqarederror", mse_xgb_reg, r2_xgb_reg))

if mse_xgb_reg < mse_linear:
  model = xgb_reg

In [19]:
xgb_reg = xgb.XGBRegressor(objective="reg:squaredlogerror", n_estimators=ESTIMATORS, learning_rate=LEARNING_RATE, max_depth=MAX_DEPTH, random_state=69)

xgb_reg.fit(X_train, y_train)

# pred
y_pred = xgb_reg.predict(X_test)
mse_xgb_reg2 = mean_squared_error(y_test, y_pred)
r2_xgb_reg2 = r2_score(y_test, y_pred)

models.append(("XGBoost_squaredlogerror", mse_xgb_reg2, r2_xgb_reg2))

if mse_xgb_reg2 < mse_xgb_reg:
  model = xgb_reg

In [20]:
xgb_reg = xgb.XGBRegressor(objective="reg:logistic", n_estimators=ESTIMATORS, learning_rate=LEARNING_RATE, max_depth=MAX_DEPTH, random_state=69)

xgb_reg.fit(X_train, y_train)

# pred
y_pred = xgb_reg.predict(X_test)
mse_xgb_reg3 = mean_squared_error(y_test, y_pred)
r2_xgb_reg3 = r2_score(y_test, y_pred)

models.append(("XGBoost_logistic", mse_xgb_reg3, r2_xgb_reg3))

if mse_xgb_reg3 < mse_xgb_reg2:
  model = xgb_reg

In [22]:
xgb_reg = xgb.XGBRegressor(objective="reg:pseudohubererror", n_estimators=ESTIMATORS, learning_rate=LEARNING_RATE, max_depth=MAX_DEPTH, random_state=69)

xgb_reg.fit(X_train, y_train)

# pred
y_pred = xgb_reg.predict(X_test)
mse_xgb_reg4 = mean_squared_error(y_test, y_pred)
r2_xgb_reg4 = r2_score(y_test, y_pred)

models.append(("XGBoost_pseudohubererror", mse_xgb_reg4, r2_xgb_reg4))

if mse_xgb_reg4 < mse_xgb_reg3:
  model = xgb_reg

In [24]:
xgb_reg = xgb.XGBRegressor(objective="reg:linear", n_estimators=ESTIMATORS, learning_rate=LEARNING_RATE, max_depth=MAX_DEPTH, random_state=69)

xgb_reg.fit(X_train, y_train)

# pred
y_pred = xgb_reg.predict(X_test)
mse_xgb_reg5 = mean_squared_error(y_test, y_pred)
r2_xgb_reg5 = r2_score(y_test, y_pred)

models.append(("XGBoost_linear", mse_xgb_reg5, r2_xgb_reg5))

if mse_xgb_reg4 < mse_xgb_reg5:
  model = xgb_reg



In [25]:
xgb_reg = xgb.XGBRegressor(objective="reg:gamma", n_estimators=ESTIMATORS, learning_rate=LEARNING_RATE, max_depth=MAX_DEPTH, random_state=69)

xgb_reg.fit(X_train, y_train)

# pred
y_pred = xgb_reg.predict(X_test)
mse_xgb_reg6 = mean_squared_error(y_test, y_pred)
r2_xgb_reg6 = r2_score(y_test, y_pred)

models.append(("XGBoost_gamma", mse_xgb_reg6, r2_xgb_reg6))

if mse_xgb_reg6 < mse_xgb_reg5:
  model = xgb_reg

In [27]:
xgb_reg = xgb.XGBRegressor(objective="reg:tweedie", n_estimators=ESTIMATORS, learning_rate=LEARNING_RATE, max_depth=MAX_DEPTH, random_state=69)

xgb_reg.fit(X_train, y_train)

# pred
y_pred = xgb_reg.predict(X_test)
mse_xgb_reg7 = mean_squared_error(y_test, y_pred)
r2_xgb_reg7 = r2_score(y_test, y_pred)

models.append(("XGBoost_tweedie", mse_xgb_reg7, r2_xgb_reg7))

if mse_xgb_reg7 < mse_xgb_reg6:
  model = xgb_reg

In [28]:
xgb_reg = xgb.XGBRegressor(objective="reg:absoluteerror", n_estimators=ESTIMATORS, learning_rate=LEARNING_RATE, max_depth=MAX_DEPTH, random_state=69)

xgb_reg.fit(X_train, y_train)

# pred
y_pred = xgb_reg.predict(X_test)
mse_xgb_reg8 = mean_squared_error(y_test, y_pred)
r2_xgb_reg8 = r2_score(y_test, y_pred)

models.append(("XGBoost_absoluteerror", mse_xgb_reg8, r2_xgb_reg8))

if mse_xgb_reg8 < mse_xgb_reg7:
  model = xgb_reg

In [43]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [44]:
# Ridge Regression
ridge = Ridge(alpha=1.0)
ridge.fit(X_train_scaled, y_train)

# Lasso Regression
lasso = Lasso(alpha=0.001)
lasso.fit(X_train_scaled, y_train)

# Predictions
ridge_preds = ridge.predict(X_test_scaled)
lasso_preds = lasso.predict(X_test_scaled)

# Compute RMSE (Root Mean Squared Error)
mse_ridge = mean_squared_error(y_test, ridge_preds)
mse_lasso = mean_squared_error(y_test, lasso_preds)

# Compute R^2 (R-squared)
r2_ridge = r2_score(y_test, ridge_preds)
r2_lasso = r2_score(y_test, lasso_preds)

models.append(("Ridge Regression", mse_ridge, r2_ridge))
models.append(("Lasso Regression", mse_lasso, r2_lasso))

In [45]:
models_df = pd.DataFrame(models, columns=["Model", "MSE", "R^2"])
models_df['RMSE'] = models_df['MSE']**1/2
models_df

Unnamed: 0,Model,MSE,R^2,RMSE
0,Dummy Regressor,0.175232,-8.4e-05,0.087616
1,XGBoost_sqarederror,0.143718,0.179772,0.071859
2,Linear Regression,0.15272,0.128395,0.07636
3,XGBoost_sqarederror,0.143718,0.179772,0.071859
4,XGBoost_squaredlogerror,0.146784,0.162272,0.073392
5,XGBoost_squaredlogerror,0.146784,0.162272,0.073392
6,XGBoost_logistic,0.143544,0.180766,0.071772
7,XGBoost_pseudohubererror,0.142685,0.185668,0.071343
8,XGBoost_linear,0.143718,0.179772,0.071859
9,XGBoost_gamma,0.148518,0.152376,0.074259


In [46]:
models_df.sort_values(by=['MSE'])

Unnamed: 0,Model,MSE,R^2,RMSE
7,XGBoost_pseudohubererror,0.142685,0.185668,0.071343
6,XGBoost_logistic,0.143544,0.180766,0.071772
1,XGBoost_sqarederror,0.143718,0.179772,0.071859
3,XGBoost_sqarederror,0.143718,0.179772,0.071859
8,XGBoost_linear,0.143718,0.179772,0.071859
10,XGBoost_tweedie,0.144791,0.17365,0.072395
5,XGBoost_squaredlogerror,0.146784,0.162272,0.073392
4,XGBoost_squaredlogerror,0.146784,0.162272,0.073392
9,XGBoost_gamma,0.148518,0.152376,0.074259
13,Ridge Regression,0.152717,0.128416,0.076358


In [47]:
display(models_df['RMSE'].sort_values())

7     0.071343
6     0.071772
1     0.071859
3     0.071859
8     0.071859
10    0.072395
5     0.073392
4     0.073392
9     0.074259
13    0.076358
12    0.076360
2     0.076360
14    0.078079
11    0.080611
0     0.087616
Name: RMSE, dtype: float64

In [None]:
train.to_csv('askscience_qas.csv',index=False)
train

In [None]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
The token `dumps_write` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `dumps_write`

In [None]:
from huggingface_hub import HfApi, HfFolder
import pandas as pd

# Your Hugging Face repo
repo_id = "yarathealmighty/ask_science_qas"

# Path to save the dataframe locally before upload
df_path = "askscience_qas.csv"

# Upload the file to your Hugging Face dataset repo
api = HfApi()
api.upload_file(
    path_or_fileobj=df_path,
    path_in_repo="train.csv",
    repo_id=repo_id,
    repo_type="dataset"  # Specify this is a dataset repo
)

askscience_qas.csv:   0%|          | 0.00/271M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/yarathealmighty/ask_science_qas/commit/bfc4f148858269ede1a1ab412246a05350efe76b', commit_message='Upload train.csv with huggingface_hub', commit_description='', oid='bfc4f148858269ede1a1ab412246a05350efe76b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/yarathealmighty/ask_science_qas', endpoint='https://huggingface.co', repo_type='dataset', repo_id='yarathealmighty/ask_science_qas'), pr_revision=None, pr_num=None)