In [12]:
import load_arxiv_data

from sklearn.feature_extraction.text import TfidfVectorizer

In [19]:
# Load the dataset through the project helper with its default arguments,
# then display the column labels of the returned frame as the cell output.
a = load_arxiv_data.load_arxiv_data()
a.columns

Index(['title', 'abstract', 'update_date', 'authors', 'id'], dtype='object')

In [17]:
# Ask the loader for its sampled variant and keep only the 'title' column.
arxiv_sample_titles = (
    load_arxiv_data.load_arxiv_data(sample=True)['title']
)

In [10]:
# Eyeball five titles from the sample. A fixed random_state makes the draw
# repeatable, so Restart & Run All prints the same five titles each time
# (change the seed to look at a different handful).
for title in arxiv_sample_titles.sample(5, random_state=0):
    # Dashes mark where each title starts and ends; some titles contain
    # embedded line breaks and would otherwise run together.
    print(f'---{title}---\n')

---The First Chiral Homology Group---

---Neural Sequential Phrase Grounding (SeqGROUND)---

---Schwinger Boson Mean Field Theories of Spin Liquid States on Honeycomb
  Lattice: Projective Symmetry Group Analysis and Critical Field Theory---

---How Costly is Noise? Data and Disparities in Consumer Credit---

---Probabilistic Trajectory Segmentation by Means of Hierarchical Dirichlet
  Process Switching Linear Dynamical Systems---



In [16]:
# TF-IDF settings for the title corpus:
#   max_df=0.95     -> ignore terms appearing in more than 95% of titles
#   min_df=2        -> ignore terms appearing in fewer than 2 titles
#   ngram_range     -> build features from unigrams and bigrams
#   stop_words      -> filter scikit-learn's built-in English stop-word list
tfidf_hyperparameters = dict(
    max_df=0.95,
    min_df=2,
    ngram_range=(1, 2),
    stop_words='english',
)

# Learn the vocabulary from the sampled titles and encode them in one pass.
# The result has one row per title and one column per vocabulary term.
vectorizer = TfidfVectorizer(**tfidf_hyperparameters)
tfidf_matrix = vectorizer.fit_transform(arxiv_sample_titles)

# Vocabulary terms, in the same order as the matrix columns.
feature_names = vectorizer.get_feature_names_out()

# Summary report: matrix dimensions, both ends of the vocabulary, and the
# weights that the FIRST title (row 0) carries in the first / last ten
# columns. Those weights are zero unless that title contains the terms.
print(
    f'The shape of the TF-IDF matrix is: {tfidf_matrix.shape}',
    f'The first 10 features are: {feature_names[:10]}',
    f'The last 10 features are: {feature_names[-10:]}',
    f'The first 10 feature values are: {tfidf_matrix[0, :10].toarray()}',
    f'The last 10 feature values are: {tfidf_matrix[0, -10:].toarray()}',
    sep='\n',
)

The shape of the TF-IDF matrix is: (200000, 174195)
The first 10 features are: ['00' '000' '000 000' '000 3d' '000 bar' '000 galaxies' '000 km'
 '000 sdss' '0001' '0001 surface']
The last 10 features are: ['zygmund operators' 'zygmund type' 'zynq' 'zytkow' 'zytkow object' 'zz'
 'zz ceti' 'zz gamma' 'zz production' 'zzz']
The first 10 feature values are: [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
The last 10 feature values are: [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
