In [217]:
import re
import nltk
import os
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
nltk_data_path = "D:/nltk_data"  # Change this to your desired directory
if not os.path.exists(nltk_data_path):
    os.makedirs(nltk_data_path)

# Append the path to NLTK's data search paths
nltk.data.path.append(nltk_data_path)

# Download the required NLTK data to the custom path
nltk.download('punkt', download_dir=nltk_data_path)
nltk.download('stopwords', download_dir=nltk_data_path)
nltk.download('wordnet', download_dir=nltk_data_path)
nltk.download('omw-1.4', download_dir=nltk_data_path)
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to D:/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to D:/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to D:/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to D:/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [218]:
def clean_text(text):
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove special characters and punctuation
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text

In [219]:
def tokenize_text(text):
    tokens = word_tokenize(text)
    return tokens

In [220]:
nltk.data.path.append("D:/nltk_data")

def tokenize_text(text):
    tokens = nltk.word_tokenize(text, language='english', preserve_line=True)
    return tokens  # Output: ['This', 'is', 'a', 'sample', 'text', 'for', 'tokenization', '.']

In [221]:
def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    return [token for token in tokens if token not in stop_words]

In [222]:
def preprocess_text(text):
    # Step 1: Remove unwanted characters
    text = clean_text(text)
    # Step 2: Tokenize
    tokens = tokenize_text(text)
    # Step 3: Remove stop words
    tokens = remove_stopwords(tokens)
    return tokens

In [223]:
raw_text = "<p>This is an example text with HTML tags, punctuation! And special #characters?</p>"
processed_tokens = preprocess_text(raw_text)
print(processed_tokens)  

['This', 'example', 'text', 'HTML', 'tags', 'punctuation', 'And', 'special', 'characters']


In [224]:
def clean_text(text):
    """Remove HTML tags, special characters, and punctuation."""
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters and punctuation
    return text

lemmatizer = WordNetLemmatizer()

def lemmatize_tokens(tokens):
    """Lemmatize tokens."""
    return [lemmatizer.lemmatize(token) for token in tokens]

def tokenize_text(text):
    """Tokenize the text."""
    tokens = word_tokenize(text, language='english', preserve_line=True)
    return tokens
def handle_missing_data(text, placeholder='Missing'):
    """Handle missing or noisy data."""
    if pd.isnull(text) or text.strip() == "":
        return placeholder
    return text

def to_lowercase(tokens):
    """Convert tokens to lowercase."""
    return [token.lower() for token in tokens]

def remove_stopwords(tokens):
    """Remove stopwords."""
    stop_words = set(stopwords.words('english'))
    return [token for token in tokens if token not in stop_words]

def preprocess_text(text):
    """Complete text preprocessing pipeline."""
    text = handle_missing_data(text)  # Handle missing or noisy data
    text = clean_text(text)          # Clean text (remove unwanted characters)
    tokens = tokenize_text(text)     # Tokenize text
    tokens = to_lowercase(tokens)    # Convert to lowercase
    tokens = remove_stopwords(tokens)  # Remove stopwords
    tokens = lemmatize_tokens(tokens)  # Or stem_tokens(tokens) for stemming
    return ' '.join(tokens)      

In [225]:
df = pd.read_csv('data/2018_cleaned.csv')
df.head()

Unnamed: 0,title,categories/keyword,author_tags
0,hydrogeological parameter distribution estimat...,international journal of civil engineering and...,"['groundwater model', 'hydraulic conductivity'..."
1,applying psychic distance to services internat...,journal of asia-pacific business,"['cultural distance', 'health care services', ..."
2,estimation of aloe-emodin content in cassia gr...,indian journal of pharmaceutical sciences,"['aloe-emodin contents', 'cassia garrettiana',..."
3,anti-rice pathogenic microbial activity of per...,science and technology asia,"['antimicrobial activity', 'essential oil', 'p..."
4,line-1 orf1 protein is up-regulated by reactiv...,cancer genomics and proteomics,"['4-hne', 'bladder cancer', 'cancer progressio..."


In [None]:
tfidf_vectorizer_titles = TfidfVectorizer(max_features=10000, min_df=1, max_df=0.8)
df['title_tfidf'] = list(tfidf_vectorizer_titles.fit_transform(df['title']).toarray())

# Sentence Embeddings for abstracts
# df['tags_tfidf'] = list(tfidf_vectorizer.fit_transform(df['author_tags']).toarray())

# Save extracted features
df.to_pickle("processed_features.pkl")
# df['author_tags'].head()
# df[['title_tfidf', 'tags_tfidf']].head()
# Get the feature names (words) from the vectorizer
vocab = tfidf_vectorizer_titles.get_feature_names_out()
title_tfidf_vector = df['title_tfidf'][1]  # TF-IDF vector
nonzero_indices = [i for i, value in enumerate(title_tfidf_vector) if value != 0]

# Print words and their corresponding TF-IDF scores
for index in nonzero_indices:
    print(f"Word: {vocab[index]}, TF-IDF Score: {title_tfidf_vector[index]}")

Word: applying, TF-IDF Score: 0.3403172942934855
Word: caregiver, TF-IDF Score: 0.3179765854455264
Word: case, TF-IDF Score: 0.21998024295224758
Word: distance, TF-IDF Score: 0.3179765854455264
Word: elderlyarticle, TF-IDF Score: 0.35805011251327074
Word: internationalization, TF-IDF Score: 0.35805011251327074
Word: japanese, TF-IDF Score: 0.327735659539065
Word: psychic, TF-IDF Score: 0.35805011251327074
Word: service, TF-IDF Score: 0.27294668374665065
Word: study, TF-IDF Score: 0.18609364123438332
Word: thai, TF-IDF Score: 0.1800932172800236


In [227]:
# Apply preprocessing to specific columns (e.g., 'title' and 'abstract')
df['title'] = df['title'].apply(preprocess_text)
df['categories/keyword'] = df['categories/keyword'].apply(preprocess_text)
df['author_tags'] = df['author_tags'].apply(preprocess_text)



# Save the cleaned DataFrame
df.to_csv('data/2018_cleaned_processed.csv', index=False)

In [228]:
new_df = pd.read_csv('data/2018_cleaned_processed.csv')
new_df.head()

Unnamed: 0,title,categories/keyword,author_tags,title_tfidf
0,hydrogeological parameter distribution estimat...,international journal civil engineering techno...,groundwater model hydraulic conductivity krigi...,[0. 0. 0. ... 0. 0. 0.]
1,applying psychic distance service internationa...,journal asiapacific business,cultural distance health care service japan pe...,[0. 0. 0. ... 0. 0. 0.]
2,estimation aloeemodin content cassia grandis c...,indian journal pharmaceutical science,aloeemodin content cassia garrettiana cassia g...,[0. 0. 0. ... 0. 0. 0.]
3,antirice pathogenic microbial activity persica...,science technology asia,antimicrobial activity essential oil persicari...,[0. 0. 0. ... 0. 0. 0.]
4,line orf protein upregulated reactive oxygen s...,cancer genomics proteomics,hne bladder cancer cancer progression immunohi...,[0. 0. 0. ... 0. 0. 0.]


In [259]:
# Example pipeline for feature extraction
# TF-IDF for titles
tfidf_vectorizer_titles = TfidfVectorizer(max_features=10000, min_df=0.01, max_df=0.8) #exclude > 80% duplicate and < 1%
df['title_tfidf']  = list(tfidf_vectorizer_titles.fit_transform(df['title']).toarray())

X_tfidf = tfidf_vectorizer_titles.fit_transform(df['title']).toarray()

# Get the feature names (the words)
header = tfidf_vectorizer_titles.get_feature_names_out()

# Convert the TF-IDF array to a DataFrame
df_tfidf = pd.DataFrame(X_tfidf, columns=header)

# Now you can add this DataFrame back to your original DataFrame
df = pd.concat([df, df_tfidf], axis=1)
df = df.drop(columns=['title_tfidf'])

df.head()
# sample_tfidf = tfidf_vectorizer_titles.fit_transform()


# # Sentence Embeddings for abstracts
# # df['tags_tfidf'] = list(tfidf_vectorizer.fit_transform(df['author_tags']).toarray())

# # Save extracted features
# df.to_pickle("processed_features.pkl")
# # df['author_tags'].head()
# # df[['title_tfidf', 'tags_tfidf']].head()

# print(name_tfidf)

Unnamed: 0,title,categories/keyword,author_tags,access,acid,activity,acute,among,analysis,application,...,thailandarticle,thailandarticleopen,therapy,treatment,two,use,using,via,virus,water
0,hydrogeological parameter distribution estimat...,international journal civil engineering techno...,groundwater model hydraulic conductivity krigi...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.634454,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,applying psychic distance service internationa...,journal asiapacific business,cultural distance health care service japan pe...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,estimation aloeemodin content cassia grandis c...,indian journal pharmaceutical science,aloeemodin content cassia garrettiana cassia g...,0.280471,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.540294,0.0,0.0,0.0
3,antirice pathogenic microbial activity persica...,science technology asia,antimicrobial activity essential oil persicari...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,line orf protein upregulated reactive oxygen s...,cancer genomics proteomics,hne bladder cancer cancer progression immunohi...,0.20759,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [261]:
tfidf_vectorizer_titles = TfidfVectorizer(max_features=5000, min_df=0.01, max_df=0.8)
title_tfidf = tfidf_vectorizer_titles.fit_transform(df['title'])

# TF-IDF for tags
tfidf_vectorizer_tags = TfidfVectorizer(max_features=5000, min_df=0.01, max_df=0.8)
tags_tfidf = tfidf_vectorizer_tags.fit_transform(df['author_tags'])

# Optionally, get the feature names (terms)
title_feature_names = tfidf_vectorizer_titles.get_feature_names_out()
tags_feature_names = tfidf_vectorizer_tags.get_feature_names_out()

In [264]:
from scipy.sparse import hstack

# Combine the separate TF-IDF features into one feature matrix
X = hstack([title_tfidf, tags_tfidf])

# Convert the combined features to a dense format (optional, depending on model)
# X = X.toarray()  # Only if your model requires dense arrays, some models like Logistic Regression do
X.toarray()

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.28047099, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])