# Gender Prediction Based On Tweets 

In [1]:
# Import libraries
import pandas as pd
import re, os
from collections import Counter
import nltk
from nltk.corpus import stopwords, wordnet 
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfTransformer,TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix

In [2]:
# Input file locations; every data file sits directly under DATA_PATH
DATA_PATH = r'./data/'
# Account lists (CSV)
TWITTER_ACCOUNTS_FILE = os.path.join(DATA_PATH, 'celebrity_twitter_accounts.csv')
TEST_TWITTER_ACCOUNTS_FILE = os.path.join(DATA_PATH, 'twitter_test.csv')
# Tweet dumps (JSON)
ORIG_TWEET_FILE = os.path.join(DATA_PATH, 'all_tweets.json')
TEST_TWEET_FILE = os.path.join(DATA_PATH, 'test_tweets.json')

In [3]:
# Extra tokens removed together with the NLTK English stop words during tweet clean-up
additional_stop_words = [
    'twitter',
    'com',
    'via',
]

Class indices (used as the probability column names in the results below): female = 0, male = 1

## Load Training Data

In [4]:
# Load the training list of celebrity Twitter accounts from the CSV file
celebrity_twitter_accounts = pd.read_csv(TWITTER_ACCOUNTS_FILE)

In [5]:
# How many Twitter accounts are in the training list
len(celebrity_twitter_accounts)

37

In [6]:
# Preview the first five training accounts
celebrity_twitter_accounts.head(5)

Unnamed: 0,screenname,name,followers_in_millions,gender
0,BarackObama,Barack Obama,122,male
1,justinbieber,Justin Bieber,112,male
2,katyperry,Katy Perry,108,female
3,rihanna,Rihanna,98,female
4,Cristiano,Cristiano Ronaldo,88,male


In [7]:
# Load the training tweets from the JSON file into a DataFrame
all_tweets_df = pd.read_json(ORIG_TWEET_FILE)

In [11]:
# How many training tweets were loaded
len(all_tweets_df)

23454

In [12]:
# Share of tweets per gender label (fractions, in unsorted order)
all_tweets_df['gender'].value_counts(sort=False, normalize=True)

female    0.495864
male      0.504136
Name: gender, dtype: float64

## Text Preprocessing

In [13]:
def get_wordnet_pos(word):
    """
    Return the WordNet POS constant for a single word.

    The word is tagged on its own with nltk.pos_tag and only the first letter
    of the resulting tag is inspected: J -> adjective, N -> noun, V -> verb,
    R -> adverb. Any other tag falls back to noun.
    """
    _, pos_tag = nltk.pos_tag([word])[0]
    first_letter = pos_tag[0].upper()

    if first_letter == "J":
        return wordnet.ADJ
    if first_letter == "V":
        return wordnet.VERB
    if first_letter == "R":
        return wordnet.ADV
    # "N" and every unmapped tag both resolve to noun
    return wordnet.NOUN

def text_cleanup(text):
    '''
    Text pre-processing
        return the cleaned words joined into a single space-separated string

    Steps: lowercase, replace URLs and non-alphabetic characters with spaces,
    drop words shorter than 3 characters, lemmatize each word using its POS tag,
    then remove English stop words plus the entries of additional_stop_words.

    Non-string input (e.g. None or NaN for a missing tweet) and text with no
    usable words both yield an empty string.
    '''
    # Missing tweet text would otherwise fail on .lower()
    if not isinstance(text, str):
        return ''
    # Convert to lowercase
    text_clean = text.lower()
    # Replace non-alphabet characters and URLs (scheme://...) with spaces, then tokenize
    text_clean = re.sub(r'[^a-zA-Z]|(\w+:\/\/\S+)',' ', text_clean).split()
    # Remove short words (length < 3)
    text_clean = [w for w in text_clean if len(w)>2]
    # Nothing left to lemmatize or filter
    if not text_clean:
        return ''
    # Lemmatize text with the appropriate POS tag
    lemmatizer = WordNetLemmatizer()
    text_clean = [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in text_clean]
    # Filter out stop words in English (applied after lemmatization)
    stops = set(stopwords.words('english')).union(additional_stop_words)
    text_clean = ' '.join([w for w in text_clean if w not in stops])

    return text_clean

In [14]:
# Clean every training tweet and store the cleaned text in a new 'token' column
all_tweets_df['token'] = list(map(text_cleanup, all_tweets_df['text']))
# Keep only tweets whose cleaned text is longer than 20 characters
# (very short tweets carry little meaning), then renumber the rows from 0
long_enough = all_tweets_df['token'].map(len) > 20
tweets = all_tweets_df[long_enough].reset_index(drop=True)

In [15]:
# Show five randomly chosen processed tweets (fixed seed keeps the sample repeatable)
tweets.sample(n=5, random_state=11)

Unnamed: 0,screenname,text,name,gender,token
11365,Simone_Biles,wish I could turn my thoughts off so I could g...,Simone Biles,female,wish could turn thought could sleep already
60,justinbieber,some will hate....more will want to love. Lett...,Justin Bieber,male,hate want love let love ignore hate keep positive
10827,serenawilliams,"@celebsdontreply. Of course, I reply.",Serena Williams,female,celebsdontreply course reply
18066,BillGates,"Everyone should know about the #MDGs, 8 goals ...",Bill Gates,male,everyone know mdgs goal revolutionize fight po...
3081,HillaryClinton,Bill and I were saddened to hear of the passin...,Hillary Clinton,female,bill sadden hear passing wilson roosevelt jerm...


In [16]:
# How many tweets remain after preprocessing and the length filter
len(tweets)

19663

In [17]:
# Share of remaining tweets per gender label (fractions, in unsorted order)
tweets['gender'].value_counts(sort=False, normalize=True)

female    0.496313
male      0.503687
Name: gender, dtype: float64

## Training Logistic Regression Using TF-IDF Features

In [18]:
# Term Frequency-Inverse Document Frequency (TF-IDF) features
# X is the TF-IDF matrix of the cleaned tweets, y the gender label of each tweet.
# NOTE(review): the vectorizer is fitted on all tweets before the train/validation
# split below, so its vocabulary and IDF weights also see the validation tweets.
text_transformer = TfidfVectorizer()
X = text_transformer.fit_transform(tweets['token'])
y = tweets['gender']

In [37]:
# Splitting data into training and validation set (80/20, stratified by gender, fixed seed)
# NOTE(review): the split is per tweet, not per account, so tweets from the same
# account can land in both sets; the validation score may be optimistic.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=39)

In [38]:
# Train using Logistic Regression with default hyper-parameters
# (fit() returns the estimator, so its configuration is displayed as the cell output)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [39]:
# Validation result: predicted gender label for every validation tweet
y_val_pred = logreg.predict(X_val)

In [40]:
# Validation confusion matrix: rows are true labels, columns are predicted labels,
# both in sorted label order (female, male)
confusion_matrix(y_val, y_val_pred)

array([[1539,  413],
       [ 492, 1489]], dtype=int64)

In [41]:
# Micro-averaged F1 on the validation set (for single-label classification this equals accuracy)
f1_score(y_val, y_val_pred, average='micro')

0.7698957538774472

## Gender Prediction On Test Data

In [24]:
# Load the test Twitter account list; the path comes from the shared constant
# defined with the other file paths instead of a duplicated string literal
test_twitter_accounts = pd.read_csv(TEST_TWITTER_ACCOUNTS_FILE)

In [25]:
# Preview the test accounts
test_twitter_accounts.head()

Unnamed: 0,screenname,name,gender
0,StephenCurry30,Stephen Curry,male
1,BarbaraCorcoran,Barbara Corcoran,female
2,DUALIPA,DUALIPA,female
3,Mike_Pence,Mike Pence,male


In [51]:
# Load the test tweets from the JSON file into a DataFrame
test_tweets_df = pd.read_json(TEST_TWEET_FILE)

In [52]:
# Clean every test tweet with the same pipeline used for the training tweets
test_tweets_df['token'] = list(map(text_cleanup, test_tweets_df['text']))
# Keep only tweets whose cleaned text is longer than 20 characters,
# then renumber the rows from 0
long_enough_test = test_tweets_df['token'].map(len) > 20
test_tweets = test_tweets_df[long_enough_test].reset_index(drop=True)

In [53]:
# How many test tweets remain after preprocessing and the length filter
len(test_tweets)

208

In [54]:
# Share of test tweets per gender label (fractions, in unsorted order)
test_tweets['gender'].value_counts(sort=False, normalize=True)

female    0.504808
male      0.495192
Name: gender, dtype: float64

In [55]:
# TF-IDF features for the test tweets, using the vectorizer fitted on the training tweets
bow = text_transformer.transform(test_tweets['token'])
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); support both so the cell runs on old and new versions
if hasattr(text_transformer, 'get_feature_names_out'):
    feature_names = text_transformer.get_feature_names_out()
else:
    feature_names = text_transformer.get_feature_names()
# toarray() yields a plain ndarray (todense() returns the discouraged np.matrix)
df_bow_test = pd.DataFrame(bow.toarray(), columns=feature_names)

In [56]:
# Predict probability (one column per class, named 0 and 1 by position)
pred_prob = pd.DataFrame(logreg.predict_proba(df_bow_test))
# Predict classification (gender label per tweet, stored in column 'pred')
pred = pd.DataFrame(data=logreg.predict(df_bow_test), columns=['pred'])
# Merge into the same DataFrame (column-wise concat aligned on the row index;
# test_tweets had its index reset, so all three frames share 0..n-1)
result = pd.concat([test_tweets, pred, pred_prob], axis=1, sort=False)

In [57]:
# Prediction performance on the test tweets: rows are true labels, columns are
# predicted labels, both in sorted label order (female, male)
confusion_matrix(test_tweets.gender, pred)

array([[58, 47],
       [38, 65]], dtype=int64)

In [58]:
# Micro-averaged F1 on the test tweets (for single-label classification this equals accuracy)
f1_score(test_tweets.gender, pred, average='micro')

0.5913461538461539

In [63]:
# Show ten random test tweets with their predicted label and class probabilities
# (fixed seed keeps the sample repeatable)
result.sample(n=10, random_state=42)

Unnamed: 0,screenname,text,name,gender,token,pred,0,1
161,BarbaraCorcoran,All new #BusinessUnusual #podcast episode! I'm...,Barbara Corcoran,female,new businessunusual podcast episode share top ...,female,0.58555,0.41445
15,DUALIPA,OVER 2 BILLION STREAMS ON MY BABY FUTURE NOSTA...,DUALIPA,female,billion stream baby future nostalgia month old...,female,0.519296,0.480704
73,StephenCurry30,Grab your üçø and üëÄ the #UltimateHomeChampionshi...,Stephen Curry,male,grab ultimatehomechampionship stream free youtube,male,0.326881,0.673119
96,StephenCurry30,That was about as many minutes as we played th...,Stephen Curry,male,many minute played year klaythompson,male,0.409329,0.590671
166,StephenCurry30,Make sure you challenge yourself! Not every dr...,Stephen Curry,male,make sure challenge every drill gonna pretty s...,male,0.368265,0.631735
9,DUALIPA,Lebanon is on its knees and needs us more than...,DUALIPA,female,lebanon knee need ever yesterday explosion bei...,female,0.702905,0.297095
100,StephenCurry30,Praying for you and your entire family @KarlTo...,Stephen Curry,male,pray entire family karltowns aint word man sorry,male,0.395304,0.604696
135,BarbaraCorcoran,The best thing about all of us wrestling throu...,Barbara Corcoran,female,best thing wrestling terrible chapter give pau...,female,0.500326,0.499674
18,DUALIPA,cinnamon toast crunch https://t.co/cG2mhk6rdT,DUALIPA,female,cinnamon toast crunch,female,0.64278,0.35722
148,BarbaraCorcoran,Check out the toughest tasks first. I've learn...,Barbara Corcoran,female,check toughest task first learn confront afrai...,male,0.418067,0.581933


Predict each person's gender by majority vote over the tweet-level predictions for the same Twitter account

In [67]:
# Count the tweets per (account, predicted label) pair.
# The former trailing .max(level=0) grouped on the unique default row index, so it
# changed nothing, and the `level` argument no longer exists in pandas 2.x.
vote_counts = result.groupby(['screenname', 'pred'], as_index=False)['token'].count()
# For each account keep the label with the most tweets (majority vote).
# idxmax() returns index labels, so select with .loc rather than positional .iloc.
winning_rows = vote_counts.groupby('screenname')['token'].idxmax()
result_count = vote_counts.loc[winning_rows].reset_index(drop=True)

In [68]:
# Attach each account's aggregated prediction to the test account list
# (join on screenname, keeping every test account), then show true vs predicted gender
pred_per_person = test_twitter_accounts.set_index("screenname").join(result_count.set_index("screenname"))
pred_per_person[["gender", "pred"]]

Unnamed: 0_level_0,gender,pred
screenname,Unnamed: 1_level_1,Unnamed: 2_level_1
StephenCurry30,male,male
BarbaraCorcoran,female,male
DUALIPA,female,female
Mike_Pence,male,male
