In [None]:
#import packages, load config file and do twitter authentication
import configparser 
import tweepy
import time
import re
import pandas as pd
import os
import numpy as np
import preprocessing as pp
#from sklearn import cross_validation
from sklearn import datasets
from sklearn import svm
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC
from sklearn.externals import joblib
import random

# Load the project settings from config.ini.
config = configparser.ConfigParser()
config.read("config.ini")

# Twitter's four OAuth credentials, all taken from the [twitter] section
# (the values must already be present in the config file).
consumer_key, consumer_secret, access_key, access_secret = (
    config.get('twitter', option)
    for option in ('consumer_key', 'consumer_secret', 'access_key', 'access_secret')
)

# Build the OAuth handler that the data-gathering cells hand to tweepy.API.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)

# Location of the stop-word list, from the [preprocessing] section.
stopwords_path = config.get('preprocessing', 'stopwords_path')

In [None]:
#get bad tweets


#function to get tweet text from tweepy status
def get_tweetText(status):
    """Return the text of a tweepy status, preferring the extended form.

    If the raw payload ``status._json`` has an ``extended_tweet`` entry, the
    ``full_text`` of ``status.extended_tweet`` is returned; otherwise the plain
    ``status.text`` is returned.
    """
    has_extended = 'extended_tweet' in status._json
    return status.extended_tweet['full_text'] if has_extended else status.text

# Search Twitter for Persian ('fa') tweets containing each word of the swear-word list.
api = tweepy.API(auth)
fa_bad_words_file = config.get('data_gathering', 'fa_swear_words_file')
fa_bad_words = pd.read_csv(fa_bad_words_file, header=None)
fa_bad_words.columns = ['word']
max_tweets = 1  # NOTE(review): original read "1#0" - presumably 10 for a full run; confirm
all_bad_tweets = []
for bad_word in fa_bad_words['word']:
    try:
        print('--------------- getting tweets containing "' + bad_word + '"------------------')
        cursor = tweepy.Cursor(api.search, q=bad_word, lang='fa')
        searched_tweets = list(cursor.items(max_tweets))
        # Every tweet is kept as a one-element list holding its text.
        tweets = [[get_tweetText(found)] for found in searched_tweets]
        all_bad_tweets.extend(tweets)
        time.sleep(30)  # wait 30 seconds between queries to avoid the 'Max retries exceeded' error
    except Exception as err:
        # Best effort: report the failure, back off for 60 seconds, go on with the next word.
        print(err)
        time.sleep(60)
        continue



# Clean the tweets: drop '#', turn commas into spaces and remove newlines, then
# strip retweet prefixes ("RT @user") and any remaining @mentions.
# Each entry of all_bad_tweets is a one-element list, hence tweet[0].
all_bad_tweets_clean = [
    tweet[0].replace("#", "").replace(",", " ").replace("\n", '')
    for tweet in all_bad_tweets
]
# Raw-string patterns: '\s' inside a plain string literal is an invalid escape sequence.
all_bad_tweets_clean = [re.sub(r'RT @[^\s]+', '', tweet) for tweet in all_bad_tweets_clean]
all_bad_tweets_clean = [re.sub(r'@[^\s]+', '', tweet) for tweet in all_bad_tweets_clean]

# Create a dataframe with the columns ['text', 'isOffensive']; every row is labelled
# offensive (1). Naming the column up front also works when no tweets were collected,
# where renaming the columns of an empty frame afterwards would fail.
bad_tweets_df = pd.DataFrame({'text': all_bad_tweets_clean})
bad_tweets_df['isOffensive'] = 1

# Save results. Rows are written WITHOUT a header line in both cases
# (columns on disk: index, text, isOffensive), so anything reading the file back
# has to treat the first line as data.
# NOTE(review): the path comes from the [twitter] section here, while the loading
# cell reads 'bad_tweets_file' from [data_gathering] - confirm both name the same file.
bad_tweets_filename = config.get('twitter', 'bad_tweets_file')
# Create the file on the first run, append to it on later runs.
bad_tweets_write_mode = 'a' if os.path.isfile(bad_tweets_filename) else 'w'
bad_tweets_df.to_csv(bad_tweets_filename, mode=bad_tweets_write_mode, header=False)


In [None]:
#get good tweets 

# Accounts whose timelines are used as the source of non-offensive tweets.
good_users_file = config.get('data_gathering', 'good_users_file')
good_users = pd.read_csv(good_users_file, header=None)
good_users.columns = ['userId']
users = good_users['userId']

api = tweepy.API(auth)
all_good_tweets = []
num_user_tweets = 1  # NOTE(review): original read "1#00" - presumably 100 for a full run; confirm
for usr in users:
    try:
        print('--------------- getting tweets from "' + usr + '" ------------------')
        user_tweets = api.user_timeline(
            screen_name=usr,
            count=num_user_tweets,
            include_rts=True,
        )
        # Collected as plain strings (one text per tweet).
        for tweet in user_tweets:
            all_good_tweets.append(get_tweetText(tweet))
    except Exception as e:
        # Best effort: report the failure and go on with the next user.
        print(str(e))
        continue
        
# Clean the tweets: drop '#', turn commas into spaces and remove newlines, then
# strip retweet prefixes ("RT @user") and any remaining @mentions.
# all_good_tweets holds plain strings, so the text is used as-is; indexing it with
# [0] (as was done before) would keep only the first character of every tweet.
all_good_tweets_clean = [
    tweet.replace("#", "").replace(",", " ").replace("\n", '')
    for tweet in all_good_tweets
]
# Raw-string patterns: '\s' inside a plain string literal is an invalid escape sequence.
all_good_tweets_clean = [re.sub(r'RT @[^\s]+', '', tweet) for tweet in all_good_tweets_clean]
all_good_tweets_clean = [re.sub(r'@[^\s]+', '', tweet) for tweet in all_good_tweets_clean]

# Create a dataframe with the columns ['text', 'isOffensive'] from the GOOD tweets
# (it was previously built from the bad-tweet list and the bad-tweet frame);
# every row is labelled non-offensive (0).
good_tweets_df = pd.DataFrame({'text': all_good_tweets_clean})
good_tweets_df['isOffensive'] = 0

# Save results. Rows are written WITHOUT a header line in both cases
# (columns on disk: index, text, isOffensive), so anything reading the file back
# has to treat the first line as data.
# NOTE(review): the path comes from the [twitter] section here, while the loading
# cell reads 'good_tweets_file' from [data_gathering] - confirm both name the same file.
good_tweets_filename = config.get('twitter', 'good_tweets_file')
# Create the file on the first run, append to it on later runs.
good_tweets_write_mode = 'a' if os.path.isfile(good_tweets_filename) else 'w'
good_tweets_df.to_csv(good_tweets_filename, mode=good_tweets_write_mode, header=False)


In [None]:
# Read bad and good tweets from file.
# The gathering cells write these files without a header row
# (to_csv(..., header=False)), so header=None is required: letting read_csv infer
# a header would consume the first tweet of each file as column names.
tweet_columns = ['ID', 'text', 'isOffensive']
bad_tweets = pd.read_csv(
    config.get('data_gathering', 'bad_tweets_file'), header=None, names=tweet_columns
)
good_tweets = pd.read_csv(
    config.get('data_gathering', 'good_tweets_file'), header=None, names=tweet_columns
)

# Drop rows whose text is missing or empty. Each frame is filtered with a mask
# computed from its OWN 'text' column (good_tweets was previously masked with the
# bad_tweets column).
bad_tweets = bad_tweets[
    bad_tweets['text'].notnull() & ~bad_tweets['text'].str.len().eq(0)
]
good_tweets = good_tweets[
    good_tweets['text'].notnull() & ~good_tweets['text'].str.len().eq(0)
]

# ignore_index gives the combined frame unique row labels; both inputs carry
# their own (overlapping) labels.
tweets_df = pd.concat([bad_tweets, good_tweets], ignore_index=True)
tweets_df.head()

In [None]:

# Run the project's cleaning helper over the raw tweet texts and keep its output
# in a new 'preprocessed' column.
# NOTE(review): later cells join each entry with spaces, so cleanText presumably
# returns one token sequence per tweet - confirm against preprocessing.cleanText.
docs = tweets_df.loc[:, "text"].values
preprocessed_docs = pp.cleanText(docs, stopwords_path)
tweets_df["preprocessed"] = preprocessed_docs


In [None]:
# splitting data into training and test sets 

def shuffle(df):
    """Return a copy of ``df`` with its rows in random order and a fresh 0..n-1 index.

    Rows are picked by position rather than by index label: selecting by label
    returns every matching row for a repeated label, which duplicates rows when
    the index is not unique (for example after concatenating frames). The
    ``.ix`` indexer used previously has also been removed from pandas. The
    input frame is left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose rows should be shuffled.

    Returns
    -------
    pandas.DataFrame
        The same rows in random order (driven by the ``random`` module), with
        the old index dropped.
    """
    positions = list(range(len(df)))
    random.shuffle(positions)
    # reset_index returns a new frame, so its result has to be the return value.
    return df.iloc[positions].reset_index(drop=True)

# Shuffle rows
tweets_df = shuffle(tweets_df)

# Get test and train sets: every row independently goes to the training set with
# probability 0.8, so the split is only approximately 80% / 20% and differs
# between runs (no random seed is set here).
msk = np.random.rand(len(tweets_df)) < 0.8

train_tweets_df = tweets_df[msk]
test_tweets_df = tweets_df[~msk]

print('number of train examples : ' + str(len(train_tweets_df)))
# This line reports the TEST set size (it was previously labelled "train").
print('number of test examples : ' + str(len(test_tweets_df)))


In [None]:


# Stop words, read from the 'stopwords' column of the stop-word CSV file.
df = pd.read_csv(stopwords_path, delimiter=',')
stop_words = df["stopwords"].values

# Re-join every preprocessed training tweet into one space-separated string.
texts = [' '.join(tweet_tokens) for tweet_tokens in train_tweets_df["preprocessed"]]
y = train_tweets_df['isOffensive'].tolist()

# stop_words has to be passed by keyword: the first positional parameter of
# CountVectorizer is `input`, so a positional list is not used as stop words
# (and newer scikit-learn versions reject it outright).
vectorizer = CountVectorizer(stop_words=stop_words.tolist(), min_df=0.0000001)
X = vectorizer.fit_transform(texts)

# Train the model
# max_iter is given as an int; the float 1e5 is not a valid iteration count.
model = LinearSVC(class_weight="balanced", dual=False, tol=1e-2, max_iter=100000)
# The wrapped estimator is passed positionally because its keyword name differs
# between scikit-learn versions (base_estimator in older ones, estimator in newer ones).
cclf = CalibratedClassifierCV(model)
cclf.fit(X, y)

# Save the fitted vectorizer and the calibrated model
joblib.dump(vectorizer, 'vectorizer.joblib')
joblib.dump(cclf, 'model.joblib')

#vectorizer = joblib.load(pkg_resources.resource_filename('profanity_check', 'data/vectorizer.joblib'))
#model = joblib.load(pkg_resources.resource_filename('profanity_check', 'data/model.joblib'))
import numpy as np
# From here on `model` refers to the calibrated classifier.
model = cclf


def _get_profane_prob(prob):
    """Return the second entry (``prob[1]``) of one probability row.

    Kept for backward compatibility; ``predict_prob`` slices the column directly.
    """
    return prob[1]


def predict(texts):
    """Vectorize ``texts`` with the fitted vectorizer and return the model's predicted labels."""
    return model.predict(vectorizer.transform(texts))


def predict_prob(texts):
    """Return, for each text, column 1 of the model's ``predict_proba`` output.

    With the 0/1 ``isOffensive`` labels used for training, column 1 is
    presumably the probability of the offensive class - confirm via
    ``model.classes_``. Slicing the column replaces the previous per-row
    ``np.apply_along_axis`` callback and yields the same values.
    """
    return model.predict_proba(vectorizer.transform(texts))[:, 1]




# Side-by-side view of the true labels, predicted probabilities and predicted
# labels for the held-out tweets.
test_texts = [' '.join(tweet_tokens) for tweet_tokens in test_tweets_df["preprocessed"].tolist()]
result_rows = [
    test_tweets_df['isOffensive'].tolist(),
    predict_prob(test_texts),
    predict(test_texts),
]
# One row per quantity, transposed so that each tweet becomes a row.
test_df = pd.DataFrame(result_rows).T
test_df.columns = ['isOffensive' , 'isOffensive_pred_prob' ,  'isOffensive_pred' ]
test_df.head()








In [None]:
# Precision and recall of the predicted labels on the held-out tweets.
true_labels = test_tweets_df['isOffensive'].tolist()
predicted_labels = predict(
    [' '.join(tweet_tokens) for tweet_tokens in test_tweets_df["preprocessed"].tolist()]
)
precision = precision_score(true_labels, predicted_labels)
recall = recall_score(true_labels, predicted_labels)
print('precision : ' , ("%.2f" % precision))
print('recall : ' , ("%.2f" % recall))
     