In [1]:
import dask.dataframe as ddf
import pandas as pd
import numpy as np
import spacy as sp
import time, re, os

from datetime import datetime

# !pip install pysentimiento
# from pysentimiento.preprocessing import preprocess_tweet

import warnings
warnings.filterwarnings('ignore')



In [2]:
# !pip install pysentimiento
from pysentimiento.preprocessing import preprocess_tweet

In [15]:
# !pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz
!python3 -m spacy download en_core_web_sm  # can't find model en_core_web_sm

Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-sm==3.7.1
[0m  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-sm
  Attempting uninstall: en-core-web-sm
    Found existing installation: en-core-web-sm 2.2.0
    Uninstalling en-core-web-sm-2.2.0:
      Successfully uninstalled en-core-web-sm-2.2.0
Successfully installed en-core-web-sm-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [3]:
# Number of partitions each timeline DataFrame is split into for dask.
n_partitions = 6

# English spaCy pipeline loaded with the components named in `disable`
# switched off; a 'sentencizer' component is then appended to it.
nlp = sp.load(
    'en_core_web_sm',
    disable=['parser', 'ner', 'textcat'],
)
nlp.add_pipe('sentencizer')  # sentence separation

# English stop words dropped by removeStopwords (exact, case-sensitive match
# against whitespace-separated tokens).
#
# NOTE(review): entries spelled with a double quote (e.g. 'ain"t', 'you"re')
# can never match text that went through processText, because processText
# replaces '"' with an apostrophe before stop words are removed. Decide
# whether these should be spelled with "'" instead; they are left untouched
# here so the list stays comparable with data that was already produced.
STOPWORDS_ENG = {
    'a', 'about', 'above', 'after', 'again', 'against', 'ah', 'ai', 'ain', 'ain"t', 'aint', 'all', 'am',
    'amp', 'an', 'and', 'any', 'are', 'aren', 'aren"t', 'arent', 'as', 'at', 'b', 'bc', 'be', 'because',
    'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'c', 'can', 'couldn', 'couldn"t',
    'couldnt', 'd', 'did', 'didn', 'didn"t', 'didnt', 'do', 'does', 'doesn', 'doesn"t', 'doesnt',
    'doing', 'don', 'don"t', 'dont', 'down',
    'during', 'e', 'each', 'f', 'few', 'for', 'from', 'ft', 'further', 'g', 'get', 'getta', 'gon',
    'gonna', 'h', 'had', 'hadn', 'hadn"t', 'hadnt', 'has', 'hasn', 'hasn"t', 'hasnt', 'have', 'haven',
    'haven"t', 'havent',
    'having', 'here', 'how', 'if', 'in', 'into', 'is',
    'isn', 'isn"t', 'isnt', 'it"s', 'j', 'just', 'k', 'l', 'll', 'lt', 'm', 'ma',
    'mightn', 'mightn"t', 'mightnt', 'more', 'most', 'mustn', 'mustn"t', 'n',
    'na', 'needn', 'needn"t', 'neednt',
    'nor', 'now', 'o', 'of', 'off', 'oh', 'on', 'once', 'only', 'or', 'other', 'out', 'over', 'own',
    'p', 'q', 'r', 're', 'rn', 'rt', 's', 'same', 'shan', 'shan"t', 'shant', 'she"s', 'shes',
    'should', 'should"ve', 'shouldve', 'shouldn', 'shouldn"t', 'shouldnt', 'so', 'some', 'such',
    't', 'ta', 'than', 'that', 'that"ll', 'thatll',
    'the', 'then', 'there', 'these', 'this', 'those', 'through', 'to', 'too', 'under', 'until',
    'u', 'up', 'ur', 'v', 've', 'very', 'vs', 'w', 'was', 'wasn', 'wasn"t', 'wasnt', 'were',
    'weren', 'weren"t', 'werent', 'what', 'when', 'where', 'which', 'while', 'who', 'whom', 'why',
    'will', 'with', 'won', 'won"t', 'wont', 'wouldn', 'wouldn"t', 'wouldnt', 'x', 'y', 'yall',
    # A comma was missing between the last two entries, which silently fused
    # them into the single string 'youvez'.
    'you"d', 'youd', 'you"ll', 'youll', 'you"re', 'youre', 'you"ve', 'youve', 'z',
}



In [4]:
# Remove stop words
def removeStopwords(text):
  """Drop every whitespace-separated token of `text` found in STOPWORDS_ENG.

  The comparison is exact (case-sensitive). The surviving tokens are returned
  joined by single spaces, without leading or trailing whitespace.
  """
  kept_tokens = (token for token in text.split() if token not in STOPWORDS_ENG)
  # split() never yields empty or whitespace-bearing tokens, so the joined
  # string is already single-spaced and trimmed.
  return ' '.join(kept_tokens)

text = 'I am a student'
print(text)
removeStopwords(text)

I am a student


'I student'

In [5]:
# Remove punctuation
def removePunctuation(text):
  """Blank out punctuation and symbol characters, then tidy the spacing.

  Every character matched by the class below is replaced with a space; runs
  of spaces are then collapsed to one and the result is trimmed. The class
  does not contain '@' or the plain apostrophe, so those are left in place.
  """
  symbol_class = r'[_━🇧.▪"\[!"#\$%&\(\)\*\+,-\./:;<=>\?\^`{\|}~¿¡¬‘’£¥€¢₩°«»“”— ´¨¸•¤‹›–…·\]]'
  spaced_out = re.sub(symbol_class, ' ', text)
  single_spaced = re.sub(' +', ' ', spaced_out)
  return single_spaced.strip()

text = 'I am a student.%@(())'
print(text)
removePunctuation(text)

I am a student.%@(())


'I am a student @'

In [6]:
# Preprocess tweet
def helper_preprocess(text,demojiFlag):
  """Run pysentimiento's `preprocess_tweet` on `text` with fixed settings.

  Only the `demoji` option varies per call (taken from `demojiFlag`); every
  other option is pinned to the values below. Returns whatever
  `preprocess_tweet` returns.
  """
  fixed_options = {
      'lang': 'en',
      'user_token': '@usuario',
      'url_token': 'url',
      'preprocess_hashtags': True,
      'hashtag_token': None,
      'shorten': 3,
      'normalize_laughter': True,
      'emoji_wrapper': 'emoji',
  }
  return preprocess_tweet(text, demoji=demojiFlag, **fixed_options)


In [7]:
# Lemmatization and POS tag
def lemmatizeAndPOStagText(text):
  """Lemmatize `text` with the module-level spaCy pipeline `nlp`.

  Returns a pair of space-joined strings:
    * the lemma of every token, and
    * every token rendered as '<lemma>_<POS tag>'.
  """
  tokens = list(nlp(text))

  lemmas = [token.lemma_ for token in tokens]
  lemmas_with_pos = [f'{token.lemma_}_{token.pos_}' for token in tokens]

  return ' '.join(lemmas).strip(), ' '.join(lemmas_with_pos).strip()

text = 'I am a student'
print(text)
lemmatizeAndPOStagText(text)

I am a student


('I be a student', 'I_PRON be_AUX a_DET student_NOUN')

In [8]:
def processText(allText, demojiFlag):
  """Normalize every text in `allText` and return the results as a list.

  Each text goes through these steps, in order:
    1. line-break characters become a space (the character class used also
       matches a literal '|');
    2. runs of spaces are collapsed and the text is trimmed;
    3. typographic quotes/accents and the double quote become an apostrophe
       (eg. I’m --> I'm   that´s --> that's), special dashes become '-';
    4. tweet preprocessing via helper_preprocess (`demojiFlag` is forwarded);
    5. punctuation removal via removePunctuation;
    6. lowercasing.
  """
  apostrophe_like = ('’', '”', '´', '"')
  dash_like = ('‑', '—')

  def _normalize(tweet):
    tweet = re.sub(r'[\r|\n|\r\n]+', ' ', tweet)
    tweet = re.sub(' +', ' ', tweet).strip()
    for symbol in apostrophe_like:
      tweet = re.sub(symbol, '\'', tweet)
    for symbol in dash_like:
      tweet = re.sub(symbol, '-', tweet)
    tweet = helper_preprocess(tweet, demojiFlag)
    return removePunctuation(tweet).lower()

  return [_normalize(tweet) for tweet in allText]

In [9]:
def cleanProcessDataframe(df):
  """Fill the four cleaned-text columns of `df` from its 'tweet' column.

  The tweets are normalized with processText (demoji disabled) and then
  lemmatized twice: once as they are ('clean_tweet_lemma',
  'clean_tweet_lemma_postags') and once after stop-word removal
  ('clean_tweet_nostop_lemma', 'clean_tweet_nostop_lemma_postags').
  The columns are written onto the DataFrame that was passed in, and that
  same DataFrame is returned.
  """

  def _fill(texts, lemma_column, postags_column):
    # One spaCy pass per text; each call returns (lemmas, lemma_POS) strings.
    pairs = [lemmatizeAndPOStagText(t) for t in texts]
    df[lemma_column] = [lemmas for lemmas, _ in pairs]
    df[postags_column] = [tagged for _, tagged in pairs]

  clean_tweets = processText(df['tweet'].values, demojiFlag=False)
  _fill(clean_tweets, 'clean_tweet_lemma', 'clean_tweet_lemma_postags')

  clean_tweets_nostop = [removeStopwords(t) for t in clean_tweets]
  _fill(clean_tweets_nostop,
        'clean_tweet_nostop_lemma', 'clean_tweet_nostop_lemma_postags')

  return df

In [10]:
def convertNum(value):
  """Return 0 for a missing value, otherwise `value` unchanged.

  A value counts as missing when it is None, a float NaN, or a string that is
  empty or whitespace-only. The string case matters when this function is
  used as a `pd.read_csv` converter, where the value is presumably the raw
  cell text (TODO confirm against the pandas version in use).

  The previous test `value == np.nan` was always False, because NaN never
  compares equal to anything, so missing values were passed through.
  Non-missing values are not cast: a numeric string stays a string.
  """
  if value is None:
    return 0
  if isinstance(value, str):
    return 0 if value.strip() == '' else value
  if isinstance(value, (float, np.floating)) and np.isnan(value):
    return 0
  return value


def convertText(value):
  """Return '' for a missing value (None or float NaN), else `value` unchanged.

  The previous test `value == np.nan` was always False, because NaN never
  compares equal to anything, so NaN was passed through instead of being
  replaced by an empty string.
  """
  if value is None:
    return ''
  if isinstance(value, (float, np.floating)) and np.isnan(value):
    return ''
  return value


In [11]:
# Load one raw timeline CSV and list its columns, non-null counts and dtypes.
# NOTE(review): the absolute path only exists on the author's machine.
df = pd.read_csv("/Users/jianhongxu/python_project/twitter_dataset/Timelines/English/Adhd_eng/usuario_15000.csv")

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80 entries, 0 to 79
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   class                 80 non-null     object
 1   tweet_id              80 non-null     int64 
 2   day                   80 non-null     object
 3   time                  80 non-null     object
 4   tweet                 80 non-null     object
 5   tweet_favorite_count  80 non-null     int64 
 6   tweet_retweet_count   80 non-null     int64 
 7   tweet_source          80 non-null     object
 8   user_id               80 non-null     int64 
 9   user_followers_count  80 non-null     int64 
 10  user_friends_count    80 non-null     int64 
 11  user_listed_count     80 non-null     int64 
 12  user_statuses_count   80 non-null     int64 
dtypes: int64(8), object(5)
memory usage: 8.2+ KB


In [19]:
def preprocessingData(fileName, baseDirectory='/Users/jianhongxu/python_project/twitter_dataset/Timelines/English'):
  """Clean every timeline CSV of one folder and write the cleaned copies.

  Each file found in `<baseDirectory>/<fileName>` is read, run through
  cleanProcessDataframe (partition-wise, via dask), reduced to the output
  columns listed below, stripped of rows whose 'clean_tweet_lemma' is empty,
  and saved as `user_<original file name>` in
  `<baseDirectory>/clean_users/<fileName>` (created if missing).

  Args:
    fileName: name of the folder to process, e.g. 'Adhd_eng'.
    baseDirectory: folder that contains `fileName` and receives the
      `clean_users` output tree. Defaults to the previously hard-coded path.

  Returns:
    None. Progress is printed after every 100 processed files.
  """
  timelineDirectory = f'{baseDirectory}/{fileName}'
  cleanUsersDirectory = f'{baseDirectory}/clean_users/{fileName}'
  # exist_ok replaces the separate "exists?" check, which could race with
  # another process creating the folder in between.
  os.makedirs(cleanUsersDirectory, exist_ok=True)

  print(f'*** {timelineDirectory} ***')

  # Keep regular, non-hidden files only (hidden entries such as '.DS_Store'
  # and sub-folders cannot be parsed as timelines), and sort them because
  # os.listdir returns entries in arbitrary order.
  usuarios = sorted(
      name for name in os.listdir(timelineDirectory)
      if not name.startswith('.')
      and os.path.isfile(os.path.join(timelineDirectory, name))
  )
  print(f'Number of users: {len(usuarios)}')

  generatedColumns = ['clean_tweet_lemma', 'clean_tweet_lemma_postags',
                      'clean_tweet_nostop_lemma', 'clean_tweet_nostop_lemma_postags']
  outputColumns = ['class', 'tweet_id', 'day', 'time',
                   'tweet',
                   'clean_tweet_lemma', 'clean_tweet_lemma_postags',
                   'clean_tweet_nostop_lemma', 'clean_tweet_nostop_lemma_postags',
                   'tweet_favorite_count', 'tweet_retweet_count',
                   'tweet_source',
                   'user_id',
                   'user_followers_count', 'user_friends_count']

  # Counting from 1 makes the progress line report the number of files done.
  for count, user in enumerate(usuarios, start=1):

    df = pd.read_csv(os.path.join(timelineDirectory, user),
                     low_memory=True,
                     converters={'tweet': convertText,
                                 'tweet_favorite_count': convertNum,
                                 'tweet_retweet_count': convertNum},
                     dtype={'tweet_id': str, 'user_id': str})

    # Create the generated columns up front so that `df` already has the
    # layout of the result and can be handed to map_partitions as `meta`.
    for column in generatedColumns:
      df[column] = ''

    dask_dataframe = ddf.from_pandas(df, npartitions=n_partitions)
    result = dask_dataframe.map_partitions(cleanProcessDataframe, meta=df)
    df = result.compute()

    cleanData = df[outputColumns]
    cleanData = cleanData[cleanData['clean_tweet_lemma'] != '']

    cleanData.to_csv(os.path.join(cleanUsersDirectory, f'user_{user}'), index=False)

    if count % 100 == 0:
      print(f'Processing {count}/{len(usuarios)},{datetime.now().strftime("%d-%m-%Y %H:%M:%S")}')


# One folder is processed per run of this cell. The commented-out calls below
# are kept as a log, each followed by the user count and the progress lines
# or duration noted for it (presumably from earlier runs).

# preprocessingData('Adhd_eng')
# Number of users: 622
# Processing 99/622,28-08-2024 20:26:52
# Processing 199/622,28-08-2024 20:50:31
# Processing 299/622,28-08-2024 21:13:32
# Processing 399/622,28-08-2024 21:32:43
# Processing 499/622,28-08-2024 21:48:13
# Processing 599/622,28-08-2024 22:04:26

# preprocessingData('Bipolar_eng')
# Number of users: 136
# Duration: 22m 43.2s

# preprocessingData('Control_eng')
# Number of users: 1703
# Duration: 298m 41.3s

preprocessingData('Depression_eng')
# Number of users: 249
# Duration: 39m 35.3s

*** /Users/jianhongxu/python_project/twitter_dataset/Timelines/English/Depression_eng ***
Number of users: 249
Processing 99/249,29-08-2024 10:43:00
Processing 199/249,29-08-2024 10:58:45


In [3]:
# Load one cleaned output file to confirm the generated columns are present.
# NOTE(review): this rebinds `df`, and the absolute path only exists on the
# author's machine.
df = pd.read_csv("/Users/jianhongxu/python_project/twitter_dataset/Timelines/English/clean_users/Adhd_eng/user_usuario_15000.csv")

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80 entries, 0 to 79
Data columns (total 15 columns):
 #   Column                            Non-Null Count  Dtype 
---  ------                            --------------  ----- 
 0   class                             80 non-null     object
 1   tweet_id                          80 non-null     int64 
 2   day                               80 non-null     object
 3   time                              80 non-null     object
 4   tweet                             80 non-null     object
 5   clean_tweet_lemma                 80 non-null     object
 6   clean_tweet_lemma_postags         80 non-null     object
 7   clean_tweet_nostop_lemma          80 non-null     object
 8   clean_tweet_nostop_lemma_postags  80 non-null     object
 9   tweet_favorite_count              80 non-null     int64 
 10  tweet_retweet_count               80 non-null     int64 
 11  tweet_source                      80 non-null     object
 12  user_id                 

In [5]:
# Show the row labelled 0: the original tweet followed by its four cleaned
# variants, in the order lemma, lemma+POS, no-stop-word lemma, no-stop-word
# lemma+POS.
tweet = df['tweet'][0]
clean_tweet_lemma = df['clean_tweet_lemma'][0]
clean_tweet_lemma_postags = df['clean_tweet_lemma_postags'][0]
clean_tweet_nostop_lemma = df['clean_tweet_nostop_lemma'][0]
clean_tweet_nostop_lemma_postags = df['clean_tweet_nostop_lemma_postags'][0]
print(tweet)
print(clean_tweet_lemma)
print(clean_tweet_lemma_postags)
print(clean_tweet_nostop_lemma)
print(clean_tweet_nostop_lemma_postags)

"@USER AAABDVSGJS NOO you're too kind 😭🥺 more like i plagued everyone 😤 me too aaa gdjshvdh im super happy i met you and enjoyed myself in this space 🥺🥰💞💕"
' @usuario aaabdvsgjs noo you be too kind 😭 🥺 more like I plague everyone 😤 I too aaa gdjshvdh I m super happy I meet you and enjoy myself in this space 🥺 🥰 💞 💕 '
'_PUNCT @usuario_ADV aaabdvsgjs_PROPN noo_PROPN you_PRON be_AUX too_ADV kind_ADV 😭_ADJ 🥺_PROPN more_ADV like_ADP I_PRON plague_VERB everyone_PRON 😤_PUNCT I_PRON too_ADV aaa_PROPN gdjshvdh_PROPN I_PRON m_VERB super_ADV happy_ADJ I_PRON meet_VERB you_PRON and_CCONJ enjoy_VERB myself_PRON in_ADP this_DET space_NOUN 🥺_PROPN 🥰_PROPN 💞_NOUN 💕_X '_PUNCT
' @usuario aaabdvsgjs noo you be kind 😭 🥺 like I plague everyone 😤 I aaa gdjshvdh I m super happy I meet you enjoy myself space 🥺 🥰 💞 💕 '
'_PUNCT @usuario_ADV aaabdvsgjs_PROPN noo_PROPN you_PRON be_AUX kind_ADV 😭_ADJ 🥺_ADJ like_ADP I_PRON plague_VERB everyone_PRON 😤_PROPN I_PRON aaa_NOUN gdjshvdh_PROPN I_PRON m_VERB super_ADV happ