In [1]:
import sys
import os 
sys.path.append('../../../Mental_Disorder/3_feature_visualization') # get old tweets library
import age_gender_predictor
from collections import defaultdict
import re
from tabulate import tabulate
from pymongo import MongoClient
import numpy as np
import pandas as pd

# Get user tweets
Read each user's tweets file and summarize the tweet content to query the age and gender.

In [2]:
# Read Patients
def readPatient(folder, filename):
    """Load one tab-separated tweet file and return its rows as field lists.

    The path is built by plain concatenation (``folder + filename``), so
    ``folder`` must already end with a path separator.  Every line is
    stripped of surrounding whitespace and then split on tab characters.

    Returns:
        list[list[str]]: one list of fields per line in the file.
    """
    rows = []
    with open(folder + filename, 'r') as tweet_file:
        # Expected line layout (taken from the original note):
        # screen_name<tab>screen_name<tab>tweet_time<tab>tweet_content
        for raw_line in tweet_file:
            rows.append(raw_line.strip().split('\t'))
    return rows
def checkFolderFile(folder):
    """Return the names of all entries located directly inside ``folder``."""
    entry_names = os.listdir(folder)
    return entry_names

In [3]:
# Announce the load and show which folder is being scanned.
print('\n Reading patient tweets..')
folder = '../../twitter crawler/patient_tweets/'
print(folder)

# Every entry in the folder is read as one patient's tweet file; the entry
# name is used as the dictionary key.
patient_list = checkFolderFile(folder)
patient_tweets_dict = defaultdict(list)
for patient_name in patient_list:
    patient_tweets_dict[patient_name] = readPatient(folder, patient_name)

print(' Patient Number from tweets folder:' + str(len(patient_tweets_dict)))


 Reading patient tweets..
../../twitter crawler/patient_tweets/
 Patient Number from tweets folder:345


### Regular people

In [4]:
def getLangRatio(cursor):
    """Compute, per user, the fraction of tweets whose ``lang`` is ``"en"``.

    Args:
        cursor: iterable of tweet documents; each one is read through
            ``tweet["lang"]`` and ``tweet["user"]["id"]``.

    Returns:
        dict: user id -> share of that user's tweets flagged as English
        (a value between 0 and 1).
    """
    # Collect a 0/1 English flag for every tweet, grouped by author.
    flags_by_user = {}
    for tweet in cursor:
        is_english = int(tweet["lang"] == "en")
        flags_by_user.setdefault(tweet["user"]["id"], []).append(is_english)

    # Collapse each flag list into its mean.
    return {
        user_id: np.sum(flags) / len(flags)
        for user_id, flags in flags_by_user.items()
    }

def getUsersTweets(dbName,collectionName, en_threshold=0.9):
    """Read a MongoDB collection of tweets and group them per user and date.

    The collection is scanned twice: once to compute each user's ratio of
    English tweets (via ``getLangRatio``) and once to collect the tweets of
    users whose ratio is at least ``en_threshold``.

    Args:
        dbName: name of the database on the local MongoDB server.
        collectionName: name of the collection holding the tweet documents.
        en_threshold: minimum English-tweet ratio a user needs to be kept.

    Returns:
        dict: ``{user_id: {created_at: {'text', 'polarity', 'emotion',
        'emotion_2', 'ambiguous'}}}``.  ``polarity`` is 1 / -1 / 0 for
        positive / negative / anything else, ``emotion_2`` is ``None`` when
        the tweet has only one emotion group, and ``ambiguous`` is a bool.
        Tweets of one user sharing the same ``created_at`` overwrite each
        other.
    """
    # One client for both scans, always closed afterwards (the previous
    # version opened two clients and never released either of them).
    client = MongoClient("localhost", 27017)
    try:
        collection = client[dbName][collectionName]
        lang_ratios = getLangRatio(collection.find())

        usersTweets = {}
        for tweet in collection.find():
            userID = tweet["user"]["id"]
            if lang_ratios[userID] < en_threshold:
                continue

            # Processing emotions from Carlos' API
            groups = tweet["emotion"]["groups"]
            emotion = groups[0]["name"]
            emotion_2 = groups[1]["name"] if len(groups) > 1 else None
            ambiguous = tweet['emotion']['ambiguous'] == 'yes'

            if tweet["polarity"] == "positive":
                polarity = 1
            elif tweet["polarity"] == "negative":
                polarity = -1
            else:
                polarity = 0

            date = tweet["created_at"]
            text = tweet['text']

            # Key order is kept as before because it determines the column
            # order of data frames built from these records.
            record = usersTweets.setdefault(userID, {}).setdefault(date, {})
            record['text'] = text
            record['polarity'] = polarity
            record['emotion'] = emotion
            record['emotion_2'] = emotion_2
            record['ambiguous'] = ambiguous
        return usersTweets
    finally:
        client.close()

def timeSeriesTransform(usersEmotions):
    """Turn each user's ``{date: record}`` mapping into a DataFrame with a ``dt`` column.

    For every user a frame indexed by the record dates is built (missing
    values filled with 0) and a ``dt`` column is added holding the gap, in
    minutes, between a row's timestamp and the next row's timestamp.  The
    last row has no successor and keeps ``dt == 0``.  Rows are used in the
    order the frame is built in; they are not re-sorted here.

    NOTE: as before, ``usersEmotions`` is modified in place -- each value is
    replaced by its DataFrame -- so calling this twice on the same dict is
    not supported.

    Args:
        usersEmotions: dict mapping user id -> ``{timestamp: {column: value}}``.
            Timestamps must support subtraction as datetime64 values.

    Returns:
        list: the per-user DataFrames, in dict order.
    """
    for userID in usersEmotions:
        frame = pd.DataFrame.from_dict(usersEmotions[userID], orient='index').fillna(0)
        gaps_in_minutes = np.zeros(frame.shape[0], dtype=float)
        if frame.shape[0] > 1:
            stamps = frame.index.values
            # Written positionally: `.loc[:-1, 'dt']` is a label-based slice and
            # an integer bound is not a valid label on a datetime index.
            gaps_in_minutes[:-1] = (
                (stamps[1:] - stamps[:-1]).astype('timedelta64[s]') / np.timedelta64(60, 's')
            )
        frame['dt'] = gaps_in_minutes
        usersEmotions[userID] = frame
    return list(usersEmotions.values())

def getHTTPRows(timeSeries):
    """Flag the rows of ``timeSeries`` whose ``text`` contains a web link.

    A row is flagged when its text contains ``http://`` or ``https://``.

    Returns:
        numpy array of booleans, one entry per row.
    """
    url_prefixes = ['http://','https://']
    text_column = timeSeries['text']
    # One boolean mask per prefix, then OR them together.
    masks = [text_column.str.contains(prefix) for prefix in url_prefixes]
    combined = masks[0]
    for extra_mask in masks[1:]:
        combined = combined | extra_mask
    return combined.values

def userFilter(group, spam_threshold=0.5,tweets_threshold=100):
    """Drop spam-like and inactive users from ``group``.

    A user's frame is kept only when the share of its tweets containing a
    link (see ``getHTTPRows``) is strictly below ``spam_threshold`` and it
    holds strictly more than ``tweets_threshold`` tweets.

    Returns:
        list: the frames that passed both checks, in their original order.
    """
    kept = []
    for timeSeries in group:
        link_share = np.sum(getHTTPRows(timeSeries)) / timeSeries.shape[0]
        # Written as negations so that a NaN share (empty frame) is rejected.
        looks_like_spam = not (link_share < spam_threshold)
        too_few_tweets = not (timeSeries.shape[0] > tweets_threshold)
        if looks_like_spam or too_few_tweets:
            continue
        kept.append(timeSeries)
    return kept

In [5]:
# Load the regular-user tweets from the local MongoDB collection; users below
# the default English-tweet ratio are left out by getUsersTweets.
regular_tweets =  getUsersTweets("eric","regularUser_en_fixed_emotion")

In [6]:
# Convert every user's records into a DataFrame, then drop spam-like and
# inactive users using the default thresholds of userFilter.
regular_timeSeries = timeSeriesTransform(regular_tweets)
regular_clean = userFilter(regular_timeSeries)

## Gender / Age query functions

In [7]:
#Age and gender Distribution
def getAge(timeSeries):
    """Predict a user's age from an iterable of tweet strings.

    Every tweet is terminated with a newline and the pieces are joined into
    one document that is passed to ``age_gender_predictor.get_age``.
    According to the original note, the result is a float representing the age.
    """
    document = "".join(text + "\n" for text in timeSeries)
    return age_gender_predictor.get_age(document)

def getGender(timeSeries):
    """Predict a user's gender score from an iterable of tweet strings.

    Every tweet is terminated with a newline and the pieces are joined into
    one document that is passed to ``age_gender_predictor.get_gender``.
    According to the original note, the result is a float: positive values
    represent female and negative values male.
    """
    document = "".join(text + "\n" for text in timeSeries)
    return age_gender_predictor.get_gender(document)

def getRegularAge(timeSeries):
    """Predict the age of a regular user from the ``text`` column of its frame.

    The values of ``timeSeries["text"]`` are each terminated with a newline,
    joined into one document and passed to ``age_gender_predictor.get_age``.
    """
    tweet_texts = timeSeries["text"].values
    document = "".join(text + "\n" for text in tweet_texts)
    return age_gender_predictor.get_age(document)

def getRegularGender(timeSeries):
    """Predict the gender score of a regular user from the ``text`` column of its frame.

    The values of ``timeSeries["text"]`` are each terminated with a newline,
    joined into one document and passed to ``age_gender_predictor.get_gender``.
    """
    tweet_texts = timeSeries["text"].values
    document = "".join(text + "\n" for text in tweet_texts)
    return age_gender_predictor.get_gender(document)

def userValueDistribution(groups, x_name ,  method, legends, colors, bins=50):
    """Draw one histogram per group on a fixed 2x2 grid of axes.

    Args:
        groups: sequence of groups; each group is a sequence of per-user items.
        x_name: label for the x axis of every histogram.
        method: callable applied to each item of a group to get the plotted value.
        legends: per-group name used in the subplot title.
        colors: per-group bar colour.
        bins: number of histogram bins.

    The grid has four cells, filled row by row, so at most four groups can
    be drawn.

    NOTE(review): ``plt`` is not bound by any import visible in this
    notebook -- presumably ``matplotlib.pyplot``; confirm it is imported
    before this function is called.
    """
    _, axes_grid = plt.subplots(2, 2)
    grid_cells = ((0,0),(0,1),(1,0),(1,1))
    for position, members in enumerate(groups):
        measured = list(map(method, members))

        axis = axes_grid[grid_cells[position]]
        axis.hist(measured, color=colors[position], bins=bins, edgecolor='none')
        axis.set_ylabel('User count')
        axis.set_xlabel(x_name)
        axis.set_title(" ".join([str(len(members)), legends[position], "people"]))

    plt.tight_layout()
    plt.show()
    
def getAgeGender(users_tweets_list):
    """Predict age and gender for every ``(name, tweets_list)`` entry.

    Returns:
        defaultdict: ``name -> {"age": ..., "gender": ...}``.  The inner
        mappings are defaultdicts that yield ``None`` for any other key.
    """
    features = defaultdict(lambda : defaultdict(lambda:None))
    for entry in users_tweets_list:
        # entry = (name, tweets_list)
        name, tweets = entry[0], entry[1]
        profile = features[name]
        profile["age"] = getAge(tweets)
        profile["gender"] = getGender(tweets)
    return features


## Organize the files and send them to the query

In [8]:
# function to delete url
def del_url(line):
    """Remove URLs from ``line``.

    Deletion starts at the first token containing ``.com`` or at the first
    ``http://`` / ``https://`` and runs to the end of that line, so any text
    following the URL on the same line is removed as well.
    """
    url_to_line_end = re.compile(r'(\S*(\.com).*)|(https?:\/\/.*)')
    return url_to_line_end.sub("", line)
# replace tag
def checktag(line):
    """Strip every ``@`` followed by its run of non-whitespace characters from ``line``."""
    mention = re.compile(r'\@\S*')
    return mention.sub("", line)

### Prepare query list

In [9]:
# (name, tweets_list)
# Build one (name, cleaned_tweets) pair per patient.
users_tweets_content_list = []
for user in patient_tweets_dict:
    each_content_list = []
    for tweets in patient_tweets_dict[user]:
        # tweets = [screen_name, screen_name, tweet_time, tweet_content]
        if len(tweets) < 4:
            # Row without a content field (blank or truncated line): skip it.
            # An explicit length check replaces the former bare `except:`,
            # which also hid unrelated errors and interrupts.
            continue
        each_content_list.append(del_url(checktag(tweets[3])))
    users_tweets_content_list.append((user, each_content_list))

### Get Age / Gender

In [10]:
# Predict age and gender for every patient from the cleaned tweet texts.
user_age_gender_dict = getAgeGender(users_tweets_content_list)

In [12]:
# Tabulate the predictions; users are shown by running index, not by name.
headers = ["User","Age", "Gender(over 0:female)"]
contents, bd_gender, bd_age = [], [], []
for uid, user in enumerate(user_age_gender_dict):
    age, gender = user_age_gender_dict[user]["age"], user_age_gender_dict[user]["gender"]
    contents.append([uid, age, gender])
    # Kept as flat lists for the group statistics computed later.
    bd_age.append(age)
    bd_gender.append(gender)
print(tabulate(contents, headers=headers))

  User       Age    Gender(over 0:female)
------  --------  -----------------------
     0   31.9489               0.377751
     1   17.8544               2.30979
     2   32.8092              -0.834685
     3   20.0838               2.12366
     4   22.2928               2.56852
     5   24.7427              -2.03607
     6   31.7078              -1.58208
     7   17.5027               2.59505
     8   22.2678               2.37205
     9   25.7346              -1.25254
    10   21.0283               0.59909
    11   19.276                1.18466
    12   33.1247              -0.775114
    13   29.0576               2.3448
    14   23.1485               0.952546
    15   46.4422              -1.01922
    16   36.6689              -1.20518
    17   16.2564               1.71655
    18   17.8545               2.58694
    19   29.2131               4.61822
    20   22.3657               0.0943453
    21   26.5654              -2.40137
    22   42.0974               3.19079
    23   26.90

In [20]:

# Regular users: size of the filtered group and per-user predictions.
regular_num = len(regular_clean)
regular_gender = list(map(getRegularGender, regular_clean))
regular_age = list(map(getRegularAge, regular_clean))




## Organize and Show Results

In [35]:
statics_headers = ["Group","User Counts","Male users", "Female users", "Average age"]
statics_contents = []

# Regular group: a negative gender score counts as male, everything else as
# female; the male/female columns hold ratios of the group size.
regular_male_ratio = sum(1 for score in regular_gender if score < 0) / float(regular_num)
regular_female_ratio = 1 - regular_male_ratio
regular_average_age = sum(regular_age) / regular_num
statics_contents.append(
    ["Regular", regular_num, regular_male_ratio, regular_female_ratio, regular_average_age]
)

# Bipolar (bd) group, computed the same way from the patient predictions.
bd_num = len(user_age_gender_dict)
bd_male_ratio = sum(1 for score in bd_gender if score < 0) / float(bd_num)
bd_female_ratio = 1 - bd_male_ratio
bd_average_age = sum(bd_age) / bd_num
statics_contents.append(
    ["Bipolar", bd_num, bd_male_ratio, bd_female_ratio, bd_average_age]
)

In [36]:
# Render the per-group summary rows as a plain-text table.
print(tabulate(statics_contents, headers=statics_headers))

Group      User Counts    Male users    Female users    Average age
-------  -------------  ------------  --------------  -------------
Regular            679       0.46539         0.53461        28.9791
Bipolar            345       0.24058         0.75942        26.4956
