
Pygithub #1 (open): wants to merge 12 commits into base `master`
960 users - get their pull requests.csv: 10,770 additions, 0 deletions (large diff not rendered by default)

README.md: 66 additions, 1 deletion

@@ -7,4 +7,69 @@ Copyright (C) 2013 - WikiTeams contributors

py-github-wikiteams is free. You don't have to pay for it, and you can use it any way you want. It is developed as an Open Source project under the GNU General Public License (GPL). That means you have full access to the source code of this program. You can find it on our website at https://github.com/wikiteams/py-github-wikiteams. Should you wish to modify or redistribute this program, or any part of it, you should read the full terms and conditions set out in the license agreement before doing so. A copy of the license is available on our website. If you simply wish to install and use this software, you need only be aware of the disclaimer conditions in the license, which are set out below.

NO WARRANTY

Because the program is licensed free of charge, there is no warranty for the program, to the extent permitted by applicable law. Except when otherwise stated in writing the copyright holders and/or other parties provide the program "as is" without warranty of any kind, either expressed or implied, including, but not limited to, the implied warranties of merchantability and fitness for a particular purpose. The entire risk as to the quality and performance of the program is with you. Should the program prove defective, you assume the cost of all necessary servicing, repair or correction. In no event unless required by applicable law or agreed to in writing will any copyright holder, or any other party who may modify and/or redistribute the program as permitted above, be liable to you for damages, including any general, special, incidental or consequential damages arising out of the use or inability to use the program (including but not limited to loss of data or data being rendered inaccurate or losses sustained by you or third parties or a failure of the program to operate with any other programs), even if such holder or other party has been advised of the possibility of such damages.

This script parses users of the 1000 "biggest repositories" using the GitHub API
### This script parses users' skill statistics (by pull requests) for the 1000 "most active GitHub users", using the GitHub API and Google BigQuery

*Branches: head, pygithub*

The newest data and changes are always on `pygithub`

#### Data already collected:

###### 960 users - get their pull requests.csv

*It holds all pull requests, in the format: `repository name`, `count of skill (language)`, `skill (language) name`, `user`*

###### top-users-final.csv

*It holds the most active GitHub users (by paulmillr's criteria) and their 3 most often used languages*

###### users-repos-skills.csv

*It also holds their repositories*

###### logins-only.csv

*It holds only their logins*

##### Google BigQuery

*It works by querying the Google BigQuery GitHub timeline for the fields:* `repository_name`, `count(payload_pull_request_head_repo_language)`, `payload_pull_request_head_repo_language`, `payload_pull_request_head_user_login`
*and grouping them by* `payload_pull_request_head_user_login`, `payload_pull_request_head_repo_language`, `repository_name`
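The query described above could be written roughly as follows. This is a sketch only: the timeline table name (`[githubarchive:github.timeline]`, the public 2013-era legacy-SQL table) and the alias `cnt` are assumptions, not taken from this repository.

```python
# Hypothetical BigQuery legacy-SQL query matching the fields and
# grouping described in the README; the table name is an assumption.
QUERY = """
SELECT repository_name,
       COUNT(payload_pull_request_head_repo_language) AS cnt,
       payload_pull_request_head_repo_language,
       payload_pull_request_head_user_login
FROM [githubarchive:github.timeline]
GROUP BY payload_pull_request_head_user_login,
         payload_pull_request_head_repo_language,
         repository_name
"""
```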

##### Input for iterate.py script

CSV file in format:

`'fabpot'`

`'weierophinney'`

`'visionmedia'`

etc. (one quoted username per line)
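Reading that input amounts to stripping the surrounding single quotes from each line. A minimal sketch (the helper name `parse_logins` is illustrative, not part of iterate.py):

```python
def parse_logins(lines):
    """Strip surrounding single quotes from each CSV line,
    yielding plain usernames, e.g. 'fabpot' -> fabpot."""
    for line in lines:
        login = line.strip().strip("'")
        if login:
            yield login

logins = list(parse_logins(["'fabpot'", "'weierophinney'", "'visionmedia'"]))
# logins == ['fabpot', 'weierophinney', 'visionmedia']
```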

##### Output

CSV file in format:

username, repo

example:

`fabpot, linux_kernel3`

`fabpot, swap_unix`

#### Learning function

This is the data we input to the learning machine during LEARNING PHASE:

`{user: {Skill, experience, repo}}`

which means a set of users and their contributions to repositories, characterized by language and intensity (how many times they contributed)

and later for standard input we enter hypothetical user:

`{user: {Skill, experience}}`

and we want as output a repository the user would probably enjoy (a repo already present in the dataset from the learning phase)
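The learning idea above can be sketched with a toy nearest-profile lookup. Everything here is illustrative: the sample records, the flat tuple layout, and the distance-by-experience scoring are assumptions standing in for the real learning machine.

```python
# Illustrative training data: (user, skill, experience, repo).
# Values are made up for the sketch.
training = [
    ("fabpot", "PHP", 120, "symfony"),
    ("fabpot", "C", 5, "linux_kernel3"),
    ("visionmedia", "JavaScript", 300, "express"),
]

def recommend(skill, experience):
    """For a hypothetical user {skill, experience}, return the repo
    from the learning-phase dataset whose recorded experience in that
    skill is closest to the user's."""
    candidates = [(abs(exp - experience), repo)
                  for _, s, exp, repo in training if s == skill]
    return min(candidates)[1] if candidates else None

recommend("PHP", 100)  # -> "symfony"
```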
aggregation.rmp: 60 additions, 0 deletions

@@ -0,0 +1,60 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.3.013">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="5.3.013" expanded="true" name="Process">
<parameter key="logverbosity" value="init"/>
<parameter key="random_seed" value="2001"/>
<parameter key="send_mail" value="never"/>
<parameter key="notification_email" value=""/>
<parameter key="process_duration_for_mail" value="30"/>
<parameter key="encoding" value="SYSTEM"/>
<parameter key="parallelize_main_process" value="false"/>
<process expanded="true">
<operator activated="true" class="retrieve" compatibility="5.3.013" expanded="true" height="60" name="Retrieve 960 users - get their pull requests.csv" width="90" x="45" y="120">
<parameter key="repository_entry" value="//Local Repository/960 users - get their pull requests.csv"/>
</operator>
<operator activated="true" class="sort" compatibility="5.3.013" expanded="true" height="76" name="Sort" width="90" x="179" y="120">
<parameter key="attribute_name" value="user_login"/>
<parameter key="sorting_direction" value="increasing"/>
</operator>
<operator activated="true" class="aggregate" compatibility="5.3.013" expanded="true" height="76" name="Aggregate" width="90" x="313" y="120">
<parameter key="use_default_aggregation" value="false"/>
<parameter key="attribute_filter_type" value="all"/>
<parameter key="attribute" value=""/>
<parameter key="attributes" value=""/>
<parameter key="use_except_expression" value="false"/>
<parameter key="value_type" value="attribute_value"/>
<parameter key="use_value_type_exception" value="false"/>
<parameter key="except_value_type" value="time"/>
<parameter key="block_type" value="attribute_block"/>
<parameter key="use_block_type_exception" value="false"/>
<parameter key="except_block_type" value="value_matrix_row_start"/>
<parameter key="invert_selection" value="false"/>
<parameter key="include_special_attributes" value="false"/>
<parameter key="default_aggregation_function" value="average"/>
<list key="aggregation_attributes">
<parameter key="count" value="sum"/>
</list>
<parameter key="group_by_attributes" value="user_login||skill"/>
<parameter key="count_all_combinations" value="false"/>
<parameter key="only_distinct" value="false"/>
<parameter key="ignore_missings" value="true"/>
</operator>
<operator activated="true" class="sort" compatibility="5.3.013" expanded="true" height="76" name="Sort (2)" width="90" x="447" y="120">
<parameter key="attribute_name" value="user_login"/>
<parameter key="sorting_direction" value="decreasing"/>
</operator>
<connect from_op="Retrieve 960 users - get their pull requests.csv" from_port="output" to_op="Sort" to_port="example set input"/>
<connect from_op="Sort" from_port="example set output" to_op="Aggregate" to_port="example set input"/>
<connect from_op="Aggregate" from_port="example set output" to_op="Sort (2)" to_port="example set input"/>
<connect from_op="Sort (2)" from_port="example set output" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>
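The RapidMiner process above retrieves the pull-request CSV, sorts by `user_login`, sums `count` grouped by `user_login` and `skill`, and sorts the result in decreasing order. A rough plain-Python equivalent, with made-up sample rows for illustration:

```python
from collections import defaultdict

# Sample rows mirroring the CSV columns used by the process;
# the values are invented for this sketch.
rows = [
    {"user_login": "fabpot", "skill": "PHP", "count": 3},
    {"user_login": "fabpot", "skill": "PHP", "count": 2},
    {"user_login": "fabpot", "skill": "C", "count": 1},
    {"user_login": "visionmedia", "skill": "JavaScript", "count": 4},
]

# Aggregate: sum(count) grouped by (user_login, skill).
sums = defaultdict(int)
for row in rows:
    sums[(row["user_login"], row["skill"])] += row["count"]

# Sort (2): decreasing by user_login.
result = sorted(sums.items(), key=lambda kv: kv[0][0], reverse=True)
```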
aggregation_with_user_classes.rmp: 91 additions, 0 deletions

@@ -0,0 +1,91 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.3.013">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="5.3.013" expanded="true" name="Process">
<parameter key="logverbosity" value="init"/>
<parameter key="random_seed" value="2001"/>
<parameter key="send_mail" value="never"/>
<parameter key="notification_email" value=""/>
<parameter key="process_duration_for_mail" value="30"/>
<parameter key="encoding" value="SYSTEM"/>
<parameter key="parallelize_main_process" value="false"/>
<process expanded="true">
<operator activated="true" class="retrieve" compatibility="5.3.013" expanded="true" height="60" name="Retrieve 960 users - get their pull requests.csv" width="90" x="45" y="30">
<parameter key="repository_entry" value="//Local Repository/960 users - get their pull requests.csv"/>
</operator>
<operator activated="true" class="sort" compatibility="5.3.013" expanded="true" height="76" name="Sort" width="90" x="179" y="30">
<parameter key="attribute_name" value="user_login"/>
<parameter key="sorting_direction" value="increasing"/>
</operator>
<operator activated="true" class="aggregate" compatibility="5.3.013" expanded="true" height="76" name="Aggregate" width="90" x="313" y="30">
<parameter key="use_default_aggregation" value="false"/>
<parameter key="attribute_filter_type" value="all"/>
<parameter key="attribute" value=""/>
<parameter key="attributes" value=""/>
<parameter key="use_except_expression" value="false"/>
<parameter key="value_type" value="attribute_value"/>
<parameter key="use_value_type_exception" value="false"/>
<parameter key="except_value_type" value="time"/>
<parameter key="block_type" value="attribute_block"/>
<parameter key="use_block_type_exception" value="false"/>
<parameter key="except_block_type" value="value_matrix_row_start"/>
<parameter key="invert_selection" value="false"/>
<parameter key="include_special_attributes" value="false"/>
<parameter key="default_aggregation_function" value="average"/>
<list key="aggregation_attributes">
<parameter key="count" value="sum"/>
</list>
<parameter key="group_by_attributes" value="user_login||skill"/>
<parameter key="count_all_combinations" value="false"/>
<parameter key="only_distinct" value="false"/>
<parameter key="ignore_missings" value="true"/>
</operator>
<operator activated="true" class="sort" compatibility="5.3.013" expanded="true" height="76" name="Sort (2)" width="90" x="447" y="30">
<parameter key="attribute_name" value="user_login"/>
<parameter key="sorting_direction" value="decreasing"/>
</operator>
<operator activated="true" class="generate_concatenation" compatibility="5.3.013" expanded="true" height="76" name="Generate Concatenation" width="90" x="380" y="165">
<parameter key="first_attribute" value="skill"/>
<parameter key="second_attribute" value="sum(count)"/>
<parameter key="separator" value=":"/>
<parameter key="trim_values" value="true"/>
</operator>
<operator activated="true" class="aggregate" compatibility="5.3.013" expanded="true" height="76" name="Aggregate (2)" width="90" x="514" y="165">
<parameter key="use_default_aggregation" value="false"/>
<parameter key="attribute_filter_type" value="all"/>
<parameter key="attribute" value=""/>
<parameter key="attributes" value=""/>
<parameter key="use_except_expression" value="false"/>
<parameter key="value_type" value="attribute_value"/>
<parameter key="use_value_type_exception" value="false"/>
<parameter key="except_value_type" value="time"/>
<parameter key="block_type" value="attribute_block"/>
<parameter key="use_block_type_exception" value="false"/>
<parameter key="except_block_type" value="value_matrix_row_start"/>
<parameter key="invert_selection" value="false"/>
<parameter key="include_special_attributes" value="false"/>
<parameter key="default_aggregation_function" value="average"/>
<list key="aggregation_attributes">
<parameter key="skill:sum(count)" value="concatenation"/>
</list>
<parameter key="group_by_attributes" value="|user_login"/>
<parameter key="count_all_combinations" value="false"/>
<parameter key="only_distinct" value="false"/>
<parameter key="ignore_missings" value="true"/>
</operator>
<connect from_op="Retrieve 960 users - get their pull requests.csv" from_port="output" to_op="Sort" to_port="example set input"/>
<connect from_op="Sort" from_port="example set output" to_op="Aggregate" to_port="example set input"/>
<connect from_op="Aggregate" from_port="example set output" to_op="Sort (2)" to_port="example set input"/>
<connect from_op="Sort (2)" from_port="example set output" to_op="Generate Concatenation" to_port="example set input"/>
<connect from_op="Generate Concatenation" from_port="example set output" to_op="Aggregate (2)" to_port="example set input"/>
<connect from_op="Aggregate (2)" from_port="example set output" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>
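This second process extends the first: after summing counts per (user_login, skill), Generate Concatenation builds a `skill:sum(count)` string with `:` as separator, and Aggregate (2) concatenates those strings per user. A plain-Python sketch; the sample rows and the `|` join between skills are assumptions (RapidMiner's concatenation aggregation default), not read from the file:

```python
from collections import defaultdict

# Pre-aggregated (user_login, skill, sum(count)) tuples; values invented.
rows = [
    ("fabpot", "PHP", 5),
    ("fabpot", "C", 1),
    ("visionmedia", "JavaScript", 7),
]

# Generate Concatenation: "skill:sum" per row, then Aggregate (2):
# concatenate all skill:sum strings per user.
per_user = defaultdict(list)
for user, skill, total in rows:
    per_user[user].append(f"{skill}:{total}")

classes = {user: "|".join(parts) for user, parts in per_user.items()}
# classes["fabpot"] == "PHP:5|C:1"
```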