# Step 0: Importing Required Packages

In [28]:
import pandas as pd
import gensim
from scipy import stats

# Step 1: Data Loading

In [29]:
# Read the CSV (path is relative to the notebook's working directory) into a
# DataFrame. Only its 'title' column is used by the preprocessing step below.
news_df = pd.read_csv("reddit_worldnews.csv")

# Step 2: Data Preprocessing

In [30]:
def preprocess_df(df: pd.Series) -> pd.Series:
    """
    Tokenize a column of raw text into lists of normalized tokens.

    Every entry is passed through ``gensim.utils.simple_preprocess``, which
    lowercases the text, splits it into word tokens (punctuation and
    whitespace are discarded) and drops tokens that are too short or too long.
    Missing values are treated as empty text and therefore become an empty
    token list instead of raising.

    Further preprocessing steps can be added here if needed, for example:
      - Removing stopwords
      - Stemming or lemmatizing

    Parameters:
        df (pd.Series): The text column to preprocess
            (e.g. ``news_df['title']``). The name is kept for backward
            compatibility even though a Series, not a DataFrame, is expected.

    Returns:
        pd.Series: A Series with the same index as the input in which every
            value is a list of string tokens.
    """
    # simple_preprocess only accepts text; NaN cells (empty CSV fields) would
    # raise a TypeError, so replace them with empty strings first.
    text = df.fillna("")

    return text.apply(gensim.utils.simple_preprocess)

# Only the headline text is preprocessed. Despite its name, processed_df is
# whatever preprocess_df returns for the 'title' Series (one entry per title).
processed_df = preprocess_df(news_df['title'])

# Step 3: Building the Model

**Parameters:**

**vector_size** = (int) - Dimensionality of the feature vectors.

**alpha** = (float) - The initial learning rate.

**window** = (int) - The maximum distance between the current and predicted word within a sentence.

**min_count** = (int) - Ignores all words with total frequency lower than this.

**epochs** = (int) - Number of iterations over the whole dataset.


More info: https://www.kaggle.com/code/pierremegret/gensim-word2vec-tutorial#Training-the-model

In [11]:
# Word2Vec hyperparameters. The template left these as "******" placeholders,
# which is not valid Python, so the cell could not run. They are set to
# gensim's documented default values as a runnable baseline; tune them here.
WINDOW = 5          # max distance between the current and predicted word
VECTOR_SIZE = 100   # dimensionality of the word vectors
ALPHA = 0.025       # initial learning rate
MIN_COUNT = 5       # ignore words with a total frequency lower than this
EPOCHS = 5          # number of passes over the corpus

model = gensim.models.Word2Vec(
    window=WINDOW,
    vector_size=VECTOR_SIZE,
    alpha=ALPHA,
    min_count=MIN_COUNT,
    epochs=EPOCHS,
)

# No corpus was passed to the constructor, so the vocabulary has to be built
# explicitly from the tokenized titles before training can start.
model.build_vocab(processed_df)

# Step 4: Training the Model

In [None]:
# Train on the tokenized titles. The example count and number of epochs are
# read back from the model object instead of being hard-coded a second time.
model.train(processed_df, total_examples=model.corpus_count, epochs=model.epochs)

# Step 5: Testing the Model

In [None]:
# Benchmark of word pairs with human similarity ratings.
df_test = pd.read_csv("wordsim353crowd.csv")
# Rescale the ratings so the largest one is 1. Spearman correlation only looks
# at ranks, so this does not change the score; it just puts the ratings on a
# scale comparable to the model's similarity values.
df_test['Human (Mean)'] /= df_test['Human (Mean)'].max()

predictions = []   # model similarity for every pair the model can score
gt_list = []       # matching human ratings, in the same order as predictions
missed_pairs = 0   # pairs skipped because a word is not in the vocabulary

# Selecting the columns up front means a wrong column name fails loudly here
# instead of being silently counted as a missed pair.
for word_1, word_2, human_score in zip(
    df_test['Word 1'], df_test['Word 2'], df_test['Human (Mean)']
):
    # Explicit vocabulary check instead of a bare `except:`, which would also
    # hide unrelated errors (typos, interrupts) behind the missed-pair counter.
    # NOTE(review): the lookup is case-sensitive. If the training vocabulary
    # was lowercased during preprocessing, capitalised benchmark words will
    # always be counted as missed - confirm whether that is intended.
    if word_1 not in model.wv or word_2 not in model.wv:
        missed_pairs += 1
        continue
    predictions.append(model.wv.similarity(w1=word_1, w2=word_2))
    gt_list.append(human_score)

# Rank correlation between model similarities and human ratings.
spearmanr_score = stats.spearmanr(predictions, gt_list)
print("___** FINAL RESULTS **___\n")
print(f'missed_pairs: {missed_pairs}')
print(f'spearmanr_score: {spearmanr_score.statistic}')