<a href="https://colab.research.google.com/github/y4c6/master_thesis/blob/main/EJMR_toMatrix.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In this notebook, I collect the code for **combining the JSON files** and **tokenizing** the posts.

In [None]:
from google.colab import drive
# Mount Google Drive at /content/gdrive; later cells read from and write to paths under it.
drive.mount('/content/gdrive')

In [None]:
import numpy as np
import pandas as pd
import json

## Prepare JSON files

In [None]:
import glob
import json

def list_json_files(path, start_with):
    """Return the JSON files in `path` whose names start with `start_with`.

    Parameters
    ----------
    path : str
        Directory to search.
    start_with : str
        File-name prefix; it is placed in front of the ``*.json`` wildcard.

    Returns
    -------
    list of str
        Matching file paths in sorted (lexicographic) order.
    """
    json_files = glob.glob(path + f'/{start_with}*.json')
    # glob.glob() returns matches in file-system order, which is not guaranteed
    # to be stable; sorting keeps the row order of the concatenated data
    # reproducible between runs and machines.
    return sorted(json_files)

def concat_json_files(file_paths):
    """Merge the 'topic' and 'posts' lists of several JSON files.

    Parameters
    ----------
    file_paths : iterable of str
        Paths of JSON files; each must hold an object with the keys
        'topic' and 'posts', both mapping to lists.

    Returns
    -------
    dict
        ``{'topic': [...], 'posts': [...]}`` with the per-file lists appended
        in the order the files were given.

    Raises
    ------
    KeyError
        If a file lacks the 'topic' or 'posts' key.
    """
    data = {'topic': [], 'posts': []}
    for file_path in file_paths:
        # Read as UTF-8 explicitly so decoding does not depend on the
        # platform's default locale encoding.
        with open(file_path, 'r', encoding='utf-8') as f:
            file_data = json.load(f)
        # The file handle is no longer needed once the content is parsed.
        data['topic'].extend(file_data['topic'])
        data['posts'].extend(file_data['posts'])
    return data

def concat_json_files_with_start(path, start_with):
    """Find the JSON files in `path` named `start_with*.json` and merge them.

    Returns the merged ``{'topic': [...], 'posts': [...]}`` dictionary
    produced by `concat_json_files` for the files found by `list_json_files`.
    """
    return concat_json_files(list_json_files(path, start_with))

In [None]:
# Drive folder that holds the JSON files, and the file-name prefix they share.
path = '/content/gdrive/MyDrive/論文相關材料/'
start_with = 'EJMRpost_'
# Merge every matching file into one {'topic': [...], 'posts': [...]} dictionary.
data = concat_json_files_with_start(path, start_with)

In [None]:
# Number of topic entries collected across all merged files.
len(data['topic'])

In [None]:
## Build a DataFrame from the merged dictionary; its keys ('topic', 'posts') become the columns
df = pd.DataFrame(data)
df.head(3)

In [None]:
#df.to_pickle("./ejmr_20.pkl")

In [None]:
# Optional round-trip check of the pickle written by the (commented-out) cell above.
# The load is disabled, so the preview has to be disabled as well; otherwise a
# fresh "Run all" stops here with NameError: name 'unpickled_df' is not defined.
#unpickled_df = pd.read_pickle("./ejmr_20.pkl")
#unpickled_df.head(3)

## Select the Asia-related posts

In [None]:
# Keywords that make `condition` label a topic "asia" (checked first).
asia_target = ['asian', 'asia', 'korea', 'korean', 'japan', 'japanese', 'taiwan', 'taiwanese', 'east', 'hongkong']
# Keywords that make `condition` label a topic "china".
china_target = ['china', 'chinese']

In [None]:
# Label a topic by the first keyword group it mentions
def condition(x):
    """Classify a topic as "asia", "china" or "other".

    A string topic is lower-cased before matching, because every keyword in
    `asia_target` and `china_target` is lower-case; without this, titles such
    as "China" or "Japanese" would be labelled "other".  The Asia keywords are
    checked first, so a topic that mentions both groups is labelled "asia".

    NOTE(review): for strings this is a substring test, so short keywords
    also match inside longer words (e.g. 'east' inside 'least') - confirm
    that this is intended.

    Parameters
    ----------
    x : str
        Topic title.  Non-string containers are matched as-is (membership
        test), exactly as before.

    Returns
    -------
    str
        "asia", "china" or "other".
    """
    text = x.lower() if isinstance(x, str) else x
    if any(word in text for word in asia_target):
        return "asia"
    if any(word in text for word in china_target):
        return "china"
    return "other"
 
# Label every topic with `condition` and store the result in a new 'Target' column
df['Target'] = df['topic'].apply(condition)
df.head(3)

## Prepare `y`

In [None]:
# Keep only the rows labelled "asia" or "china": look up the index labels of the
# "other" rows and drop exactly those.
df = df.drop(index=df.loc[df['Target'].eq('other')].index)
df.head(3)

In [None]:
# Binary label: 1 for topics labelled "china", 0 for everything else that is left.
df['y'] = df['Target'].eq('china').astype('int64')
df.head(3)

## Define functions for preprocessing

In [None]:
import nltk

# NLTK resources needed by the preprocessing functions defined below.
nltk.download('punkt')      # tokenizer models used by word_tokenize
nltk.download('punkt_tab')  # newer NLTK releases load the tokenizer data from this package instead
nltk.download('stopwords')  # stop-word lists
nltk.download('wordnet')    # corpus required by WordNetLemmatizer

In [None]:
import string
import re
from langdetect import detect
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Define a function to preprocess text
def preprocess_text(text):
  """Clean one document, or return '' when it is not usable English text.

  Steps: language check, lower-casing, then removal of http(s) URLs, ASCII
  digits and ASCII punctuation (those characters are deleted, not replaced
  by spaces).

  Parameters
  ----------
  text : str
      Raw document.

  Returns
  -------
  str
      The cleaned text, or '' if `text` is not a non-blank string, if
      langdetect cannot determine a language for it, or if the detected
      language is not English.

  Notes
  -----
  langdetect is not deterministic by default; set ``DetectorFactory.seed``
  once before the first call if reproducible language detection is needed.
  """
  # Nothing to detect on missing or blank input.
  if not isinstance(text, str) or not text.strip():
    return ''

  # Imported locally so this function works without touching the import cell.
  from langdetect.lang_detect_exception import LangDetectException

  # detect() raises when the text has no usable features (for example only
  # digits or symbols); treat that like any other non-English document
  # instead of aborting the whole .apply().
  try:
    language = detect(text)
  except LangDetectException:
    return ''
  if language != 'en':
    return ''

  # Lowercase all characters
  text = text.lower()

  # Remove URLs
  text = re.sub(r'https?://\S+', '', text)

  # Remove digits and punctuation in a single pass
  text = text.translate(str.maketrans('', '', string.digits + string.punctuation))

  return text


# Tokenize a text, drop English stopwords, then Porter-stem what remains
def stemmer_tokenize_and_remove_stopwords(text):
  """Return the Porter stems of the non-stopword tokens of `text`.

  The text is split with NLTK's `word_tokenize`, tokens found in NLTK's
  English stop-word list are discarded, and each remaining token is stemmed.
  The result is a list of strings in the original token order.
  """
  words = word_tokenize(text)
  english_stopwords = set(stopwords.words('english'))
  porter = PorterStemmer()

  stems = []
  for word in words:
    if word in english_stopwords:
      continue
    stems.append(porter.stem(word))
  return stems


# Tokenize a text, drop English stopwords, then lemmatize what remains
def lemmatizer_tokenize_and_remove_stopwords(text):
  """Return the WordNet lemmas of the non-stopword tokens of `text`.

  The text is split with NLTK's `word_tokenize`, tokens found in NLTK's
  English stop-word list are discarded, and each remaining token is passed
  through `WordNetLemmatizer.lemmatize`.  The result is a list of strings in
  the original token order.
  """
  words = word_tokenize(text)
  english_stopwords = set(stopwords.words('english'))
  kept_words = filter(lambda word: word not in english_stopwords, words)

  wordnet = WordNetLemmatizer()
  return [wordnet.lemmatize(word) for word in kept_words]


## Prepare `X`

In [None]:
## check library
import gensim

## ignore warnings
import warnings
warnings.filterwarnings('ignore')

# # if you want to see the training messages, you can use it
# import logging
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

## Build the tokenized input: join the entries of each row's `posts` into one
## string, clean it with `preprocess_text`, then tokenize it.
## The original call used `tokenize_and_remove_stopwords`, which is not defined in
## the cells above (NameError on a fresh kernel), so it is routed to the stemming
## variant; use `lemmatizer_tokenize_and_remove_stopwords` to lemmatize instead.
df['text_tokenized'] = df['posts'].apply(
    lambda x: stemmer_tokenize_and_remove_stopwords(preprocess_text(' '.join(x)))
)
df[['y', 'text_tokenized']].head(3)

In [None]:
# Save the frame (including the 'y' and 'text_tokenized' columns) as a pickle on the mounted Drive.
df.to_pickle("/content/gdrive/MyDrive/Thesis_Data&Result/ejmr_20_token.pkl")