In [216]:
import pickle
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import re

In [217]:
# Allow up to 100 rows in a displayed DataFrame before pandas truncates it.
pd.set_option('display.max_rows', 100)

In [218]:
# Root folder of the project data. Defaults to the original author's checkout;
# set the GITHUBDATA_PROJECTDIR environment variable to run the notebook on
# another machine without editing this cell.
projectdir = os.environ.get('GITHUBDATA_PROJECTDIR', '/Users/Toavina/githubdata')
# Folder of this pipeline stage, and the pickles folder inside it.
subfolderdir = os.path.join(projectdir, '3.gh_users_filter')
picklesdir = os.path.join(subfolderdir, '1.pickles')

In [219]:
# Load aggregate dataframe.
# NOTE: unpickling can execute arbitrary code -- only load pickles produced by
# this project, never files from an untrusted source.
# The `with` block closes the file handle as soon as the frame is loaded.
with open(os.path.join(picklesdir, 'agg_df.pkl'), 'rb') as agg_df_file:
    agg_df = pickle.load(agg_df_file)

In [220]:
# Profile columns of interest, restricted to the rows whose 'name' is not null.
agg_df_w_names = agg_df.loc[
    agg_df['name'].notnull(),
    [
        'name', 'location', 'bio', 'blog', 'company',
        'created_at', 'updated_at', 'email', 'followers',
        'following', 'hireable', 'login',
        'public_gists', 'public_repos',
        'site_admin', 'type',
    ],
]

In [221]:
# Display the frame of users that have a non-null name.
agg_df_w_names

Unnamed: 0,name,location,bio,blog,company,created_at,updated_at,email,followers,following,hireable,login,public_gists,public_repos,site_admin,type
0,Dash O'Pepper,,,,,2009-04-19T23:52:47Z,2014-04-11T21:48:45Z,,6,0,,-,0,1,False,User
3,0--1,"Los Angeles, CA",,https://0--1.github.io,,2014-10-02T07:51:25Z,2016-07-12T17:06:15Z,,0,0,,0--1,0,4,False,User
4,Anton Kosinov,,,,,2011-01-09T18:12:02Z,2016-11-14T12:20:42Z,a.s.kosinov@gmail.com,0,0,True,0--key,0,14,False,User
9,lanyy,,,,,2015-05-30T07:33:40Z,2016-09-24T12:38:20Z,,10,1,,0-0-9,0,9,False,User
10,Dmitiry Usov,Russia,,,,2013-03-15T04:08:00Z,2016-10-24T16:02:55Z,,0,0,,0-0Flash0-0,1,3,False,User
12,Nickolay Yegorov,"Vancouver, Canada",,http://rocketscience.ml,RocketScience,2010-10-16T15:20:56Z,2016-11-06T01:44:48Z,nickolay.yegorov@gmail.com,12,7,True,0-1-0,6,24,False,User
15,何幺鸡,Xi'an,,,Xidian University,2013-09-10T08:00:10Z,2016-08-02T07:17:05Z,,0,6,,0-1heyi,1,5,False,User
20,Tim Hawthorn,"Glastonbury, UK",,http://grailtest.pathilorra.co.uk/,,2014-04-01T18:27:23Z,2016-09-17T21:39:39Z,afflatus@pathilorra.co.uk,2,3,,0-afflatus,4,11,False,User
21,Juan Rojas,República Dominicana,,,,2014-07-19T21:40:19Z,2016-11-21T13:48:25Z,jfco.rojas@hotmail.com,0,1,,0-cool,0,13,False,User
22,Eclipse,"Иркутск, Россия (Irkutsk, Russia)",I'm a programmer. That's all.,,Student,2015-09-16T00:15:53Z,2016-11-21T05:54:52Z,,11,0,,0-Eclipse-0,0,69,False,User


## Filter names

In [222]:
# Keep only names that look like a full personal name: at least two words,
# long enough, and free of digits and '@'.

def filter_names(string, min_length=6):
    """Return True if ``string`` looks like a full personal name and should be kept.

    The result is meant to be used as a boolean mask: True means *keep* the
    row, False means *drop* it.

    A name is rejected (False) when any of the following holds:

    * it is not a string (e.g. None or NaN),
    * it is a single word, i.e. it has no whitespace between two words,
    * it is shorter than ``min_length`` characters once surrounding
      whitespace is stripped,
    * it contains a digit,
    * it contains an '@' character.

    Parameters
    ----------
    string : str
        The candidate name.
    min_length : int, default 6
        Minimum number of characters a name must have to be kept.

    Returns
    -------
    bool
        True to keep the name, False to filter it out.
    """
    if not isinstance(string, str):
        return False

    name = string.strip()
    # str.split() without an argument splits on runs of whitespace and ignores
    # leading/trailing whitespace, so "Johnny " still counts as a single word.
    single_word = len(name.split()) < 2
    too_short = len(name) < min_length
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    has_digit = re.search(r"\d", name) is not None
    has_at = "@" in name

    return not (single_word or too_short or has_digit or has_at)

# Boolean mask over agg_df_w_names: True where the name passes filter_names.
name_mask = agg_df_w_names['name'].map(filter_names)

# Filter locations

## List items to filter

In [223]:
# Place names used by the location filter. Each group is kept both as a
# comma-separated string and as the list obtained by splitting it on ', '.
# Adjacent string literals inside parentheses are concatenated by Python, so
# every fragment except the last one must end with ', '.
asian_countries = (
    'Afghanistan, Armenia, Azerbaijan, Bahrain, Bangladesh, Bhutan, Brunei, Cambodia, '
    'China, Cyprus, Georgia, India, Indonesia, Iran, Israel, Japan, Jordan, Kazakhstan, Korea, '
    'Kuwait, Kyrgyzstan, Laos, Lebanon, Malaysia, Maldives, Mongolia, Myanmar, Nepal, '
    'Oman, Pakistan, Philippines, Qatar, Russia, Saudi Arabia, Singapore, Sri Lanka, Syria, Tajikistan, '
    'Thailand, Timor, Turkey, Turkmenistan, United Arab Emirates, Uzbekistan, Vietnam, Yemen'
)

# 'Comoros' was misspelt 'Comors'; duplicate 'Guinea' and 'Mali' entries removed.
african_countries = (
    'Algeria, Angola, Benin, Botswana, Burkina Faso, Burundi, Cabo Verde, Cameroon, '
    'Central African Republic, Chad, Comoros, Congo, Ivoire, Djibouti, Egypt, Eritrea, Ethiopia, '
    'Gabon, Gambia, Ghana, Guinea, Guinea-Bissau, Kenya, Lesotho, Liberia, Libya, Madagascar, Malawi, '
    'Mali, Mauritania, Mauritius, Morocco, Mozambique, Namibia, Niger, Nigeria, Rwanda, Sao Tome and Principe, '
    'Senegal, Seychelles, Sierra Leone, Somalia, South Africa, South Sudan, Sudan, Swaziland, Tanzania, Togo, '
    'Tunisia, Uganda, Zambia, Zimbabwe'
)

# 'Grenadines' was misspelt 'Grenadies'.
latam_countries = (
    'Antigua, Argentina, Bahamas, Barbados, Belize, Bolivia, Brazil, Brasil, Chile, Colombia, '
    'Costa Rica, Cuba, Dominica, Dominican, Dominicana, Ecuador, El Salvador, Grenada, Guatemala, '
    'Guyana, Haiti, Honduras, Jamaica, Mexico, Nicaragua, Panama, Paraguay, Peru, Perú, Saint Kitts and Nevis, '
    'Saint Lucia, Saint Vincent and the Grenadines, Suriname, Trinidad and Tobago, Uruguay, Venezuela'
)

# 'Shenzhen' was misspelt 'Shenzen'; "Xi'an" added next to 'Xian' because the
# apostrophe spelling occurs in the location data and 'Xian' does not match it.
certain_cities = (
    "Shanghai, Beijing, Delhi, Hyderabad, Bangalore, Xian, Xi'an, Shenzhen, Guangzhou, Hai Duong, "
    "Phnom Penh, Durban, Cape Town, Dhaka, Kampala, Gaborone, Accra, Rio, Buenos Aires, Guadalajara, Nanjin"
)


asian_countries_list = asian_countries.split(', ')
african_countries_list = african_countries.split(', ')
latam_countries_list = latam_countries.split(', ')
certain_cities_list = certain_cities.split(', ')

## Function to create regex string to match values in a list

In [224]:
def create_list_regex(item_list):
    """Build a case-insensitive regex string that matches any item of ``item_list``.

    Every item is escaped, so it is matched literally even if it contains
    regex metacharacters. The case-insensitivity flag ``(?i)`` is emitted
    exactly once, at the very start of the pattern: inline global flags
    anywhere else are deprecated since Python 3.6 and raise ``re.error``
    from Python 3.11 on.

    Parameters
    ----------
    item_list : sequence of str
        The literal strings to match.

    Returns
    -------
    str
        A pattern usable with ``re.search`` / ``re.compile``. For an empty
        ``item_list`` the returned pattern never matches anything.
    """
    if not item_list:
        # An empty alternation would match every string; '(?!)' matches none.
        return r'(?!)'
    alternatives = '|'.join(re.escape(item) for item in item_list)
    # The non-capturing group keeps the alternation self-contained.
    return '(?i)(?:' + alternatives + ')'

## Filter Locations

In [225]:
def filter_locations(string):
    """Return True if the location should be kept, False if it should be dropped.

    The result is meant to be used as a boolean mask. A location is dropped
    when it contains, case-insensitively, any entry of
    ``asian_countries_list``, ``african_countries_list``,
    ``latam_countries_list`` or ``certain_cities_list``.

    Non-string values (e.g. None or NaN for a missing location) are kept.

    NOTE: entries are matched as substrings, not whole words, so short
    entries can hit unrelated places (e.g. 'Oman' is found inside 'Romania').

    Parameters
    ----------
    string : str or missing value
        The location text of one user.

    Returns
    -------
    bool
        True to keep the row, False to filter it out.
    """
    # Missing locations are not strings; there is nothing to match, keep them.
    if not isinstance(string, str):
        return True

    excluded_places = (list(asian_countries_list) + list(african_countries_list)
                       + list(latam_countries_list) + list(certain_cities_list))
    if not excluded_places:
        # An empty alternation would match (and therefore drop) every location.
        return True

    # Escape each entry so it is matched literally, and pass IGNORECASE as a
    # flag rather than embedding '(?i)' inside the pattern, which raises
    # re.error on Python 3.11+.
    pattern = '|'.join(re.escape(place) for place in excluded_places)
    return re.search(pattern, string, re.IGNORECASE) is None

In [226]:
# Boolean mask over agg_df_w_names: True where the location passes filter_locations.
location_mask = agg_df_w_names['location'].map(filter_locations)

## Filtered list (location and name filtered)

In [227]:
# Combine both masks into a single boolean key. Chaining
# [name_mask][location_mask] applies a full-length mask to an already-filtered
# frame, which makes pandas reindex the key and emit a UserWarning.
agg_df_w_names[name_mask & location_mask]

  if __name__ == '__main__':


Unnamed: 0,name,location,bio,blog,company,created_at,updated_at,email,followers,following,hireable,login,public_gists,public_repos,site_admin,type
0,Dash O'Pepper,,,,,2009-04-19T23:52:47Z,2014-04-11T21:48:45Z,,6,0,,-,0,1,False,User
4,Anton Kosinov,,,,,2011-01-09T18:12:02Z,2016-11-14T12:20:42Z,a.s.kosinov@gmail.com,0,0,True,0--key,0,14,False,User
12,Nickolay Yegorov,"Vancouver, Canada",,http://rocketscience.ml,RocketScience,2010-10-16T15:20:56Z,2016-11-06T01:44:48Z,nickolay.yegorov@gmail.com,12,7,True,0-1-0,6,24,False,User
20,Tim Hawthorn,"Glastonbury, UK",,http://grailtest.pathilorra.co.uk/,,2014-04-01T18:27:23Z,2016-09-17T21:39:39Z,afflatus@pathilorra.co.uk,2,3,,0-afflatus,4,11,False,User
25,Jake Mitchell,United Kingdom,,,,2015-03-03T23:15:30Z,2016-09-26T13:53:25Z,,0,0,True,0-jake-0,0,0,False,User
34,Dmitri Iouchtchenko,"Waterloo, Canada",,https://d.i10o.ca/,,2009-10-16T21:04:22Z,2016-11-14T05:43:31Z,johnnyspoon@gmail.com,32,20,,0,12,51,False,User
35,Arjun B,,,https://play.google.com/store/apps/developer?i...,,2013-10-26T13:12:07Z,2016-11-13T06:42:59Z,,7,4,True,00-00-00,0,16,False,User
37,Ivan Konstantynov,Kiev,,,AppFellas,2013-11-21T17:10:55Z,2016-11-18T20:29:44Z,,1,3,,00000111,0,9,False,User
51,Junteng Jia,,,,,2015-03-27T08:07:31Z,2016-09-14T05:40:52Z,,0,4,,000Justin000,0,18,False,User
60,Mukesh Shrestha,,,,,2013-10-15T17:48:25Z,2016-02-27T11:43:13Z,rashbarri@gmhttps://github.com/001001001ail.com,0,0,,001001001,0,3,False,User


## Save filtered list

In [228]:
# Save the users that pass both the name and the location filter.
# The masks are combined with `&` (chained boolean indexing triggers a pandas
# reindex warning), and the `with` block guarantees the file is flushed and
# closed once the dump completes.
with open(os.path.join(picklesdir, 'filtered_users_df.pkl'), 'wb') as filtered_users_file:
    pickle.dump(agg_df_w_names[name_mask & location_mask], filtered_users_file)

  if __name__ == '__main__':
