In [None]:
!pip install datasets
!pip install tqdm
!pip install spacy

Collecting datasets
  Downloading datasets-3.3.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.1-py3-none-any.whl (484 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.9/484.9 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading xx

In [None]:
from tqdm import tqdm
from datasets import load_dataset
import pandas as pd
from matplotlib import pyplot as plt
import spacy
import re
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load the spaCy pipeline "en_core_web_sm" once; get_qas() runs it over every cleaned
# question and answer to obtain tokens.
nlp = spacy.load("en_core_web_sm")

In [None]:
def get_questions(df):
    """
    Builds one ``question`` text per row from the post title and body.

    The body (``selftext``) is appended to the title, separated by a single space, only
    when it carries real content: it must be a string and must not be empty,
    ``[deleted]`` or ``[removed]``. The previous condition chained these checks with
    ``or``, which is always true, so placeholder bodies were appended and a missing
    (``None``) body raised ``TypeError``.

    :param df: Frame with ``title`` and ``selftext`` columns.
    :type df: pandas.DataFrame

    :return: A new frame without ``title``/``selftext`` and with a ``question`` column
        as its last column. The input frame is left unmodified.
    :rtype: pandas.DataFrame
    """
    placeholders = ('', '[deleted]', '[removed]')
    questions = []
    for _, row in tqdm(df.iterrows(),desc='[clean] getting questions',total=len(df)):
        text = row['title']
        body = row['selftext']
        # isinstance() rejects both None and NaN, the two ways a missing body can show up.
        if isinstance(body, str) and body not in placeholders:
            text += ' ' + body
        questions.append(text)
    return df.drop(columns=['title','selftext']).assign(question=questions)

def contains_media(text):
    """
    Checks if a given text contains a media URL or a gif image command.

    Two things count as media:

    * an ``http``/``https`` URL that contains one of the listed image/video extensions.
      The URL is matched as a single whitespace-free token, so an unrelated link followed
      later in the text by something like ``cat.gif`` no longer triggers a match, and the
      extension is matched case-insensitively (``.PNG`` counts);
    * the literal ``![gif]`` command. The previous pattern ``![gif]*`` was a character
      class, so every text containing a ``!`` was reported as media.

    :param text: The text to check.
    :type text: str

    :return: True if the text contains media, False otherwise.
    :rtype: bool
    """
    media_url = r'https?://\S*\.(?:png|jpg|jpeg|gif|mp4|webm|avi|mov|wmv)'
    if re.search(media_url, text, flags=re.IGNORECASE):
        return True
    # Brackets are escaped so the literal command "![gif]" is matched.
    if re.search(r'!\[gif\]', text, flags=re.IGNORECASE):
        return True
    return False

def remove_whitespace(text):
    """
    Deletes every whitespace character except the plain space from a string.

    Tabs, newlines, carriage returns and other whitespace are removed outright (nothing is
    inserted in their place); ordinary space characters are kept as they are.

    :param text: The string to clean.
    :type text: str

    :return: The string without non-space whitespace.
    :rtype: str
    """
    non_space_whitespace = re.compile(r'[^\S ]+')
    return non_space_whitespace.sub('', text)

# Compiled once at definition time instead of on every call.
# The former "\U000024C2-\U0001F251" range spanned everything from the enclosed
# alphanumerics up to the enclosed ideographic supplement, which silently deleted CJK,
# Hangul and every other script located in between. It is replaced by the symbol ranges
# that lie inside that span.
_EMOJI_PATTERN = re.compile(
    "[\U0001F600-\U0001F64F"  # Emoticons
    "\U0001F300-\U0001F5FF"  # Symbols & Pictographs
    "\U0001F680-\U0001F6FF"  # Transport & Map Symbols
    "\U0001F700-\U0001F77F"  # Alchemical Symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U00002700-\U000027BF"  # Dingbats
    "\U0001F1E6-\U0001F1FF"  # Flags (iOS)
    "\U00002500-\U00002BEF"  # Various Symbols
    "\U000024C2-\U000024FF"  # Enclosed alphanumerics
    "\U00003030\U0000303D\U00003297\U00003299"  # Emoji located in the CJK symbol blocks
    "\U0000FE00-\U0000FE0F"  # Variation selectors (emoji presentation suffix)
    "\U0001F000-\U0001F251"  # Mahjong, dominoes, cards, enclosed supplements
    "]+",
    flags=re.UNICODE
)

def remove_emojis(text):
    """
    Removes emojis and certain symbols from a given text string.

    Every run of characters from the ranges listed in ``_EMOJI_PATTERN`` is deleted.
    Letters of any script (including CJK and Hangul) are preserved.

    :param text: The string from which to remove emojis and symbols.
    :type text: str

    :return: The string with emojis and symbols removed.
    :rtype: str
    """
    return _EMOJI_PATTERN.sub(r'', text)

def get_qas(df):
    """
    Flattens posts into one cleaned (question, answer, score) record per usable answer.

    A post is skipped when its question contains media (``contains_media``) or is empty
    after cleaning; each answer is skipped under the same two conditions. Cleaning applies
    ``remove_whitespace``, then ``remove_emojis``, then lower-casing. Token lists come from
    the module-level spaCy pipeline ``nlp`` with punctuation and stop-word tokens dropped.

    The question is tokenized at most once per post, and only when at least one answer
    survives, instead of being re-tokenized for every answer.

    :param df: Frame with a ``question`` column and an ``answers`` column whose cells expose
        parallel, index-addressable ``'text'`` and ``'score'`` sequences.
    :type df: pandas.DataFrame

    :return: Frame with the columns ``question``, ``answer``, ``score``,
        ``tokenized_question`` and ``tokenized_answer``.
    :rtype: pandas.DataFrame
    """
    def clean(text):
        # Same cleaning chain for questions and answers.
        return remove_emojis(remove_whitespace(text)).lower()

    def content_tokens(doc):
        # Keep token texts that are neither punctuation nor stop words.
        return [token.text for token in doc if not token.is_punct and not token.is_stop]

    tokenized_questions = []
    tokenized_answers = []
    questions = []
    answers = []
    scores = []
    for _, row in tqdm(df.iterrows(),desc='[clean] getting qas',total=len(df)):
        if contains_media(row['question']):
            continue

        question = clean(row['question'])
        if question == '':
            continue

        post_answers = row['answers']
        question_tokens = None  # filled lazily by the first answer that is kept

        for i in range(len(post_answers['score'])):
            raw_answer = post_answers['text'][i]
            if contains_media(raw_answer):
                continue

            answer = clean(raw_answer)
            if answer == '':
                continue

            if question_tokens is None:
                question_tokens = content_tokens(nlp(question))

            questions.append(question)
            answers.append(answer)
            scores.append(post_answers['score'][i])

            # Copy so that rows of the same post do not share one list object.
            tokenized_questions.append(list(question_tokens))
            tokenized_answers.append(content_tokens(nlp(answer)))

    trios = pd.DataFrame({'question':questions,'answer':answers,'score':scores,'tokenized_question':tokenized_questions,'tokenized_answer':tokenized_answers})
    return trios

def normalize_scores(df,leave_min_max_out=True):
    """
    Normalizes scores based on the highest scored answer per question.

    Every score is divided by the maximum score found among the rows that share the same
    ``question`` value. No minimum offset is applied, so negative scores stay negative.
    When the maximum of a group is 0 the ratio is undefined and is reported as NaN for the
    whole group (plain division produced a mix of NaN and -inf there).

        Args:
            df (DataFrame): The given DataFrame; needs ``question`` and ``score`` columns.
            leave_min_max_out (bool): When True (default) a new frame is returned in which
                the raw ``score`` column is replaced by the normalized one (placed last)
                and ``df`` itself is left untouched. When False, ``max_score`` and
                ``normalized_score`` columns are added to ``df`` in place and ``df`` is
                returned.

        Returns:
            DataFrame: the normalized DataFrame.
    """
    # highest score per question, broadcast back onto every row
    max_score = df.groupby('question')['score'].transform('max')

    # a zero maximum becomes NaN in the denominator so the result is NaN, never +/-inf
    normalized_score = df['score'] / max_score.where(max_score != 0)

    if leave_min_max_out:
        trimmed = df.drop(columns=['score','max_score','normalized_score'], errors='ignore')
        return trimmed.assign(score=normalized_score)

    df['max_score'] = max_score
    df['normalized_score'] = normalized_score
    return df

def after_clean(df):
    """
    Rebuilds the ``tokenized_answer`` and ``tokenized_question`` columns as clean token lists.

    Each cell is handled according to its type:

    * a ``str`` cell is split on single spaces and the last character of every piece is
      dropped (presumably undoing the text form the lists take after a CSV round trip;
      verify against the stored data);
    * any other cell is iterated and every element is converted with ``str()``.

    Every candidate token then goes through ``remove_emojis`` and ``remove_whitespace``;
    tokens that end up empty are discarded. If the first surviving token starts with ``[``
    that bracket is removed.

    A cell whose tokens are all discarded now yields an empty list. Previously the
    first-token check indexed into the empty list and raised ``IndexError``.

    :param df: Frame with ``tokenized_answer`` and ``tokenized_question`` columns.
    :type df: pandas.DataFrame

    :return: The same frame object, with both columns replaced in place.
    :rtype: pandas.DataFrame
    """
    def clean(token):
        return remove_whitespace(remove_emojis(token))

    def rebuild(raw):
        if type(raw) is str:
            candidates = (piece[:-1] for piece in raw.split(' '))
        else:
            candidates = (str(token) for token in list(raw))
        # Clean every candidate once and keep the non-empty results.
        tokens = [cleaned for cleaned in map(clean, candidates) if cleaned != '']
        # Kept tokens are never empty, so indexing [0][0] is safe once the list is non-empty.
        if tokens and tokens[0][0] == '[':
            tokens[0] = tokens[0][1:]
        return tokens

    new_token_answers = []
    new_token_questions = []
    for _, row in tqdm(df.iterrows(), desc='Cleaning some stuff up', total=len(df)):
        new_token_answers.append(rebuild(row['tokenized_answer']))
        new_token_questions.append(rebuild(row['tokenized_question']))
    df['tokenized_answer'] = new_token_answers
    df['tokenized_question'] = new_token_questions
    return df

def get_common_words(df):
    """
    Adds the words shared by question and answer, and their Jaccard index, to the frame.

    For every row the answer and question token lists are reduced to sets of distinct,
    stripped tokens with the noise tokens in ``no_no_list`` removed. ``common_words`` holds
    the tokens present in both sets, in order of first appearance in the answer.
    ``jaccard_index`` is ``|A ∩ Q| / |A ∪ Q|`` over those sets, or 0.0 when both are empty.

    Differences from the earlier counting approach: a word repeated inside the answer (or
    the question) alone is no longer treated as common, the denominator uses set sizes
    rather than raw list lengths, and two empty token lists no longer raise
    ``ZeroDivisionError``.

    :param df: Frame with ``tokenized_answer`` and ``tokenized_question`` columns holding
        iterables of string tokens.
    :type df: pandas.DataFrame

    :return: The same frame object with ``common_words`` and ``jaccard_index`` columns set.
    :rtype: pandas.DataFrame
    """
    # may need some better solution
    no_no_list = [',','.','?','!',':','*','"','(',')','[',']','{','}','-','','“','”','\\','/','\'','\\n','\\n\\n','<','>','_','^','˘','~','|','&','…']
    noise = set(no_no_list)

    def vocabulary(tokens):
        # Distinct meaningful tokens; a dict keeps first-appearance order.
        words = {}
        for token in tokens:
            word = token.strip()
            if word and word not in noise:
                words.setdefault(word, None)
        return words

    new_common_words = []
    jaccard_indices = []

    for _, row in tqdm(df.iterrows(), desc='Getting common words between questions and answers', total=len(df)):
        vocab_a = vocabulary(row['tokenized_answer'])
        vocab_q = vocabulary(row['tokenized_question'])
        common_words = [word for word in vocab_a if word in vocab_q]
        new_common_words.append(common_words)
        # intersection / (a + b - intersection), i.e. intersection over union
        union_size = len(vocab_a) + len(vocab_q) - len(common_words)
        jaccard_indices.append(len(common_words) / union_size if union_size else 0.0)
    df['common_words'] = new_common_words
    df['jaccard_index'] = jaccard_indices
    return df

def calculate_cosine_similarity(df):
    """
    Calculates the cosine similarity between a question and its corresponding answer.

    A scikit-learn ``TfidfVectorizer`` is fitted on the combined "question answer" text of
    every row, so vocabulary and IDF weights cover both sides. Earlier the combined text
    was built but never used and the vectorizer was fitted on the questions only, which
    made every answer-only word invisible to the similarity. Questions and answers are
    then transformed separately.

    ``TfidfVectorizer`` L2-normalises each row by default, so the cosine similarity of a
    pair is the dot product of its two rows. It is computed for all rows at once rather
    than through one ``cosine_similarity`` call per row. Rows without any known term get 0.

    :param df: The DataFrame containing the questions and answers for which to calculate the cosine similarity.
    :type df: pandas.DataFrame

    :return: A new DataFrame with the added ``cosine_similarity`` column; the input frame
        is not modified.
    :rtype: pandas.DataFrame
    """
    import numpy as np  # local import: numpy is not imported at the top of this notebook

    questions = df['question']
    answers = df['answer']

    vectorizer = TfidfVectorizer()
    vectorizer.fit(questions + ' ' + answers)

    tfidf_question = vectorizer.transform(questions)
    tfidf_answer = vectorizer.transform(answers)

    # Element-wise product summed per row == row-wise dot product of the unit vectors.
    row_dot_products = tfidf_question.multiply(tfidf_answer).sum(axis=1)
    cosine_similarities = np.asarray(row_dot_products).ravel()

    return df.assign(cosine_similarity=cosine_similarities)

In [None]:
ds = load_dataset("yarathealmighty/ask_science_qas")

# Convert the train split to pandas, then run the three feature stages in order:
# token clean-up, common words + Jaccard index, cosine similarity.
train = (
    ds['train']
    .to_pandas()
    .pipe(after_clean)
    .pipe(get_common_words)
    .pipe(calculate_cosine_similarity)
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


train.csv:   0%|          | 0.00/271M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/143110 [00:00<?, ? examples/s]

Cleaning some stuff up: 100%|██████████| 143110/143110 [01:29<00:00, 1592.84it/s]
Getting common words between questions and answers: 100%|██████████| 143110/143110 [00:15<00:00, 9302.03it/s]
Calculating cosine similarity for each question - answer pair: 100%|██████████| 143110/143110 [02:21<00:00, 1013.95it/s]


In [None]:
#-------------------------------------<params>-------------------------------------
TEST_SIZE = 0.1             # share of rows held out by train_test_split
ESTIMATORS = 251            # n_estimators of the XGBoost regressor
LEARNING_RATE = 0.1         # learning_rate of the XGBoost regressor
MAX_DEPTH = 4               # max_depth of the XGBoost regressor
VECTORIZER = 'transformer'  # feature branch to run: 'tfidf' or 'transformer'

ERROR_THRESHOLD = 0.2       # not referenced by any cell visible in this notebook

In [None]:
from sentence_transformers import SentenceTransformer
# TF-IDF vectorizer limited to 100 features; only the VECTORIZER == 'tfidf' branch uses it.
vectorizer = TfidfVectorizer(max_features=100)
# LaBSE sentence-embedding model; semantical_vectorziation() calls its encode() method.
s_trans = SentenceTransformer('sentence-transformers/LaBSE')
def semantical_vectorziation(df):
    """
    Encodes the ``q_t``, ``a_t`` and ``cw_t`` text columns and expands them into features.

    Each column is passed as a plain list to the module-level ``s_trans.encode``. Every
    embedding dimension becomes its own column named ``q_v_<i>``, ``a_v_<i>`` or
    ``cw_v_<i>``.

    Compared with the earlier version:

    * the embedding width is taken from the encoder output instead of the hard-coded 768;
    * the embedding frames reuse ``df.index``, so the column-wise concat stays aligned for
      frames whose index is not a fresh 0..n-1 range (filtered, sampled or shuffled);
    * no intermediate ``q_v``/``a_v``/``cw_v`` columns are written onto ``df``.

    :param df: Frame holding the ``q_t``, ``a_t`` and ``cw_t`` string columns.
    :type df: pandas.DataFrame

    :return: A new frame with all columns of ``df`` (minus any pre-existing ``q_v``,
        ``a_v``, ``cw_v``) followed by the three blocks of embedding columns.
    :rtype: pandas.DataFrame
    """
    print('Vectorizing question, answers, and common_words...')

    questions_vectorized = s_trans.encode(df['q_t'].tolist())
    print('[sem_trans] questions encoded')
    answers_vectorized = s_trans.encode(df['a_t'].tolist())
    print('[sem_trans] answers encoded')
    common_words_vectorized = s_trans.encode(df['cw_t'].tolist())
    print('[sem_trans] common words encoded')

    def as_columns(vectors, prefix):
        # One column per embedding dimension (0..d-1), renamed to '<prefix>_<i>'.
        return pd.DataFrame(vectors, index=df.index).add_prefix(f'{prefix}_')

    base = df.drop(columns=['q_v', 'a_v', 'cw_v'], errors='ignore')
    df_expanded = pd.concat(
        [
            base,
            as_columns(questions_vectorized, 'q_v'),
            as_columns(answers_vectorized, 'a_v'),
            as_columns(common_words_vectorized, 'cw_v'),
        ],
        axis=1,
    )

    return df_expanded

def pre_vec_trans(df):
    """
    Joins the token-list columns into space-separated strings for the sentence encoder.

    ``tokenized_question``, ``tokenized_answer`` and ``common_words`` are joined with a
    single space into the new columns ``q_t``, ``a_t`` and ``cw_t``.

    :param df: Frame holding the three token-list columns.
    :type df: pandas.DataFrame

    :return: The same frame object; the three text columns are added in place.
    :rtype: pandas.DataFrame
    """
    source_to_target = {
        'tokenized_question': 'q_t',
        'tokenized_answer': 'a_t',
        'common_words': 'cw_t',
    }
    joined = {target: [] for target in source_to_target.values()}
    for _, record in tqdm(df.iterrows(),desc='Preparing questions and answers for vectorization',total=len(df)):
        for source, target in source_to_target.items():
            joined[target].append(" ".join(record[source]))
    for target, texts in joined.items():
        df[target] = texts
    return df

In [None]:
# Work on a copy: the feature cells below add columns to `df` in place, and `train` is
# written to CSV and uploaded later, so it must not pick up those helper columns.
df = train.copy()

In [None]:
# Build the feature matrix X with the branch selected by VECTORIZER.
if VECTORIZER == 'tfidf':
  # NOTE(review): `pre_vec` is not defined in any cell visible in this notebook, so this
  # branch raises NameError as written. Confirm where it is supposed to come from.
  # prepare X
  X = pre_vec(df)
  df = df.drop(columns=['pre_vec_text'])

  # vectorize
  texts_tfidf = vectorizer.fit_transform(X['pre_vec_text'])

  tfidf_df = pd.DataFrame(texts_tfidf.toarray(), columns=vectorizer.get_feature_names_out())

  X = pd.concat([X, tfidf_df], axis=1)
  X = X.drop(columns=['question','answer','tokenized_question','tokenized_answer','common_words','pre_vec_text','score'])

elif VECTORIZER == 'transformer':
  # 40 mins
  # pre_vec_trans adds q_t/a_t/cw_t to `df` in place and returns that same object as X;
  # the drop below then rebinds `df` to a frame without those helper columns.
  X = pre_vec_trans(df)
  df = df.drop(columns=['q_t','a_t','cw_t'])

  X = semantical_vectorziation(X)

  X = X.drop(columns=['question','answer','tokenized_question','tokenized_answer','common_words','score','q_t','a_t','cw_t'])

else:
  # Fail here instead of leaving X undefined (or stale) for the modelling cells.
  raise ValueError(f"Unsupported VECTORIZER {VECTORIZER!r}; expected 'tfidf' or 'transformer'")

Preparing questions and answers for vectorization: 100%|██████████| 143110/143110 [00:08<00:00, 16570.58it/s]


Vectorizing question, answers, and common_words...
[sem_trans] questions encoded
[sem_trans] answers encoded
[sem_trans] common words encoded


In [None]:
# Results collected by the modelling cells below: one (model name, MSE, R^2) tuple per
# fitted model, both metrics computed on the held-out test split.
models = []

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

In [None]:
# Target vector and a reproducible train/test split of the feature matrix.
y = df['score'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=420)

# Baseline that always predicts the mean of the training targets.
dummy_regressor = DummyRegressor(strategy="mean").fit(X_train, y_train)
y_pred_dummy = dummy_regressor.predict(X_test)

mse_dummy, r2_dummy = mean_squared_error(y_test, y_pred_dummy), r2_score(y_test, y_pred_dummy)
models.append(("Dummy Regressor", mse_dummy, r2_dummy))

# The baseline is the model to beat; later cells may replace it.
model = dummy_regressor

In [None]:
# Ordinary least squares on the unscaled features.
linear_regressor = LinearRegression().fit(X_train, y_train)
y_pred_linear = linear_regressor.predict(X_test)

mse_linear, r2_linear = mean_squared_error(y_test, y_pred_linear), r2_score(y_test, y_pred_linear)
models.append(("Linear Regression", mse_linear, r2_linear))

# Replace the selected model only when the test MSE beats the dummy baseline.
if mse_dummy > mse_linear:
  model = linear_regressor

In [None]:
# Gradient-boosted trees configured from the params cell.
xgb_reg = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=ESTIMATORS, learning_rate=LEARNING_RATE, max_depth=MAX_DEPTH, random_state=69)

xgb_reg.fit(X_train, y_train)

# pred
y_pred = xgb_reg.predict(X_test)
mse_xgb_reg = mean_squared_error(y_test, y_pred)
r2_xgb_reg = r2_score(y_test, y_pred)

models.append(("XGBoost", mse_xgb_reg, r2_xgb_reg))

# Compare against the best earlier model, not only the linear one: if linear regression
# lost to the dummy baseline, beating linear alone does not make XGBoost the best model.
if mse_xgb_reg < min(mse_dummy, mse_linear):
  model = xgb_reg

In [None]:
# Fit the scaler on the training split only, then apply the same scaling to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Ridge Regression
ridge = Ridge(alpha=1.0)
ridge.fit(X_train_scaled, y_train)

# Lasso Regression
lasso = Lasso(alpha=0.001)
lasso.fit(X_train_scaled, y_train)

# Predictions
ridge_preds = ridge.predict(X_test_scaled)
lasso_preds = lasso.predict(X_test_scaled)

# Compute MSE (mean squared error; no square root is taken, so this is not RMSE)
mse_ridge = mean_squared_error(y_test, ridge_preds)
mse_lasso = mean_squared_error(y_test, lasso_preds)

# Compute R^2 (R-squared)
r2_ridge = r2_score(y_test, ridge_preds)
r2_lasso = r2_score(y_test, lasso_preds)

models.append(("Ridge Regression", mse_ridge, r2_ridge))
models.append(("Lasso Regression", mse_lasso, r2_lasso))

# The variables are named mse_ridge / mse_lasso; the prints previously referenced
# ridge_mse / lasso_mse, which do not exist, and raised NameError.
print(f'Ridge Regression MSE: {mse_ridge:.4f}')
print(f'Lasso Regression MSE: {mse_lasso:.4f}')

In [None]:
# One row per evaluated model; the bare name on the last line renders the table.
summary_columns = ["Model", "MSE", "R^2"]
models_df = pd.DataFrame(data=models, columns=summary_columns)
models_df

In [None]:
# Write the `train` frame to a local CSV without the index column; the upload cell further
# down sends this file to the Hub.
train.to_csv('askscience_qas.csv',index=False)
# Bare expression: the notebook renders `train` as this cell's output.
train

In [None]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
The token `dumps_write` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `dumps_write`

In [None]:
from huggingface_hub import HfApi, HfFolder
import pandas as pd

# Hugging Face dataset repository that receives the file
repo_id = "yarathealmighty/ask_science_qas"

# Local CSV to upload; it is the file written earlier by train.to_csv('askscience_qas.csv', ...)
df_path = "askscience_qas.csv"

# Upload the local file into the dataset repo, stored there under the name train.csv
api = HfApi()
api.upload_file(
    path_or_fileobj=df_path,
    path_in_repo="train.csv",
    repo_id=repo_id,
    repo_type="dataset"  # the target is a dataset repo
)

askscience_qas.csv:   0%|          | 0.00/271M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/yarathealmighty/ask_science_qas/commit/bfc4f148858269ede1a1ab412246a05350efe76b', commit_message='Upload train.csv with huggingface_hub', commit_description='', oid='bfc4f148858269ede1a1ab412246a05350efe76b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/yarathealmighty/ask_science_qas', endpoint='https://huggingface.co', repo_type='dataset', repo_id='yarathealmighty/ask_science_qas'), pr_revision=None, pr_num=None)