In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm
import matplotlib.pyplot as plt
from textblob import TextBlob
import advertools
import re
import spacy
import xgboost
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (classification_report,auc, 
                            roc_auc_score, precision_score,
                            recall_score,f1_score, accuracy_score)
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
import seaborn as sns
import nltk
import nltk.util
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import warnings
warnings.filterwarnings("ignore")
nltk.download("wordnet")
nltk.download("stopwords")

[nltk_data] Downloading package wordnet to C:\Users\Ali
[nltk_data]     Haider\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Ali
[nltk_data]     Haider\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
data_path = Path.cwd().parent/"Data"
final_data = pd.DataFrame()

In [3]:
for child in data_path.iterdir():
    for child_ch in child.iterdir():
        if child_ch.is_dir():
            files = list(child_ch.glob("**/*.csv"))
            for fls in tqdm(range(len(files))):
                df = pd.read_csv(files[fls])
        else:
            df = pd.read_csv(child_ch)
        final_data = pd.concat([df,final_data],axis = 0)

100%|████████████████████████████████████████████████████████████████████| 622/622 [00:11<00:00, 53.22it/s]
100%|████████████████████████████████████████████████████████████████████| 124/124 [00:02<00:00, 59.03it/s]
100%|████████████████████████████████████████████████████████████████████| 170/170 [00:03<00:00, 50.26it/s]
100%|████████████████████████████████████████████████████████████████████| 136/136 [00:02<00:00, 58.47it/s]
100%|██████████████████████████████████████████████████████████████████| 1703/1703 [00:31<00:00, 54.43it/s]
100%|████████████████████████████████████████████████████████████████████| 249/249 [00:04<00:00, 59.32it/s]
100%|██████████████████████████████████████████████████████████████████████| 26/26 [00:00<00:00, 55.66it/s]
100%|██████████████████████████████████████████████████████████████████████| 65/65 [00:01<00:00, 48.56it/s]
100%|████████████████████████████████████████████████████████████████████| 127/127 [00:02<00:00, 61.37it/s]
100%|███████████████████████

In [4]:
# final_data = final_data[final_data["class"]!="CONTROL"]
pipeline_data = final_data.copy()

In [5]:
def clean_text(text:str) -> str:
    tokens = advertools.word_tokenize(text,phrase_len=1)
    emoji_token = advertools.extract_emoji(tokens[0])
    emoji_token = emoji_token["emoji_text"]
    for tok in range(0,len(emoji_token)):
        if emoji_token[tok]:
            tokens[0][tok] = " ".join(emoji_token[tok])
    clean_text = " ".join(tokens[0])
    clean_pattern = r"@\w+|#\w+|\W+|x+|https\.\//www\.(\w+|\W+)\.com|http\w*|www\.(\w+|\W+)\.com|user\w*|\d+"
    clean_text = re.sub(clean_pattern," ",clean_text).strip()
    clean_text = re.sub(r"\s+"," ",clean_text).strip()
    return clean_text

In [6]:
def text_lemmatize(text: str) -> str:
    lemma = WordNetLemmatizer()
    tokens = advertools.word_tokenize(text,phrase_len=1)
    text = [lemma.lemmatize(tok) for tok in tokens[0]]
    return " ".join(text)

In [7]:
def remove_stopword(text: str) -> str:
    tokens = advertools.word_tokenize(text,phrase_len=1)[0]
    stop_words = stopwords.words("english")
    clean_word = [tok for tok in tokens if tok not in stop_words]
    return " ".join(clean_word)

In [8]:
def data_preprocess_pipeline(data_preprocess: pd.DataFrame,
                            filename: str,
                            ) -> pd.DataFrame:
    data_preprocess = data_preprocess[~(data_preprocess["tweet"].isnull())]
    data_preprocess = data_preprocess[["class","tweet"]]
    data_preprocess.rename(columns={"class":"Disorder"}, inplace=True)
    data_preprocess["tweet"] = data_preprocess["tweet"].apply(clean_text)
    data_preprocess["tweet"] = data_preprocess["tweet"].apply(text_lemmatize)
    data_preprocess["tweet"] = data_preprocess["tweet"].apply(remove_stopword)
    data_preprocess = data_preprocess[data_preprocess["tweet"]!=""]
    print("saving preprocessed data")
    data_preprocess.to_csv(filename,index=False)
    print("data saved")
    return data_preprocess

In [9]:
data_preprocess = data_preprocess_pipeline(pipeline_data,
                                           filename="binary_imbalanced_data_preprocess.csv",
                                           )
data_preprocess.head()

saving preprocessed data
data saved


Unnamed: 0,Disorder,tweet
0,SCHIZOPHRENIA,sally white heart white heart wont forget angel
1,SCHIZOPHRENIA,personally life split two everything psychosis...
2,SCHIZOPHRENIA,envisage sufferance look surfacing ridiculous ...
3,SCHIZOPHRENIA,take responsibility
4,SCHIZOPHRENIA,im sorry angry want slap psychologist wet mop ...


In [10]:
data_preprocess.Disorder.value_counts()

Disorder
ADHD               3018
SCHIZOPHRENIA      2959
OCD                2888
ANXIETY            2719
CONTROL            2526
PTSD               2458
DEPRESSION         2143
AUTISM             1402
EATING DISORDER     402
BIPOLAR             243
Name: count, dtype: int64