In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

# korean font settings
plt.rc("font", family="NanumGothic")
plt.rc("axes", unicode_minus=False)


# display settings

from IPython.display import set_matplotlib_formats
set_matplotlib_formats("retina")

# data settings
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [2]:
df = pd.read_csv("./data/total_data_2021-01-25.csv", encoding="cp949")


In [3]:
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Project Index,License Conflict,Component,Version,Home Page,License,Usage,Code Match,year,service,customer,project license,file
0,0,0,0,No Conflicts,airengine,Unspecified,http://code.google.com/p/airengine/,MIT License,Snippet,1,2013,KOSSA_Dev_Competition,KOSSA,[template] Basic Proprietary Commercial License,ossdev2013_15_NetScanV3Report.xlsx
1,1,1,1,No Conflicts,automitiveairmonitor,Unspecified,http://code.google.com/p/automitiveairmonitor/,MIT License,Snippet,2,2013,KOSSA_Dev_Competition,KOSSA,[template] Basic Proprietary Commercial License,ossdev2013_15_NetScanV3Report.xlsx
2,2,2,2,Declared License Conflict and Component Licens...,Network Sending Packet Generator,nspg 0.5.0,http://sourceforge.net/projects/nspg/,GNU General Public License v2.0 or later,Snippet,1,2013,KOSSA_Dev_Competition,KOSSA,[template] Basic Proprietary Commercial License,ossdev2013_15_NetScanV3Report.xlsx
3,3,3,3,No Conflicts,OpenPOWERLINK,Unspecified,http://sourceforge.net/projects/openpowerlink/,"BSD 3-clause ""New"" or ""Revised"" License",File,15,2013,KOSSA_Dev_Competition,KOSSA,[template] Basic Proprietary Commercial License,ossdev2013_15_NetScanV3Report.xlsx
4,4,4,4,No Conflicts,Original_General_Use,Unspecified,,Unspecified,Snippet (+ File),13,2013,KOSSA_Dev_Competition,KOSSA,[template] Basic Proprietary Commercial License,ossdev2013_15_NetScanV3Report.xlsx


In [4]:
len(df["customer"].unique())

475

# pre-processing

In [5]:
# License Conflict
df.loc[df["License Conflict"] =="프로젝트에  선언된 라이선스와 충돌", "License Conflict"] = "프로젝트에 선언된 라이선스와 충돌"
df.loc[df["License Conflict"] =="프로젝트 라이선스와 충돌", "License Conflict"] = "프로젝트에 선언된 라이선스와 충돌"
df.loc[df["License Conflict"] =="Declared License Conflict and Component License Conflict", "License Conflict"] = "프로젝트에 선언된 라이선스와 충돌"
df.loc[df["License Conflict"] =="Unknown", "License Conflict"] = "프로젝트에 선언된 라이선스와 충돌"
df.loc[df["License Conflict"] =="Component License Conflict", "License Conflict"] = "다른 컴포넌트 라이선스와 충돌"
df.loc[df["License Conflict"] =="No Conflict", "License Conflict"] = "충돌없음"
df.loc[df["License Conflict"] =="No Conflicts", "License Conflict"] = "충돌없음"

In [6]:
# Component
df["Component"] = df["Component"].str.strip()
df.loc[df["Component"].isnull() == True, "Component"] = "Native Abstractions for Node.js"

df.loc[df["Component"] =="jQuery", "Component"] = "jquery"
df.loc[df["Component"] =="jQuery UI - jquery/jquery-ui on GitHub", "Component"] = "jquery"
df.loc[df["Component"] =="jquery - jquery/jquery", "Component"] = "jquery"

df.loc[df["Component"] =="ezmorph", "Component"] = "EZMorph"
df.loc[df["Component"].str.contains("Bootstrap|Bootstrap - org.webjars:bootstrap|components - bootstra"), "Component"] = "bootstrap"

df.loc[df["Component"] =="mybatis", "Component"] = "MyBatis"
df.loc[df["Component"].str.contains("joda-time|Joda - Time - joda-time"), "Component"] = "Joda-Time"


df.loc[df["Component"] =="Apache Log4j - log4j:log4j", "Component"] = "Apache Log4j"

df.loc[df["Component"] =="mybatis-spring", "Component"] = "MyBatis-Spring"
df.loc[df["Component"].str.contains("jstree|jstree - org.webjars:jstree"), "Component"] = "jsTree"
df.loc[df["Component"].str.contains("JSch|jsch"), "Component"] = "JSCH"
df.loc[df["Component"] =="jdom", "Component"] = "JDOM"
df.loc[df["Component"] =="jackson-core", "Component"] = "Jackson-core"
df.loc[df["Component"] =="okhttp", "Component"] = "OkHttp"
df.loc[df["Component"].str.contains("font-awesome|Font-Awesome-font|font-awesome-5-css|font-awesome-svg-png|font-awesome.css|Font Awesome - org.webjars:font-awesome|font-awesome-css"), "Component"] = "Font-Awesome"
df.loc[df["Component"] =="Json-lib", "Component"] = "json-lib"
df.loc[df["Component"].str.contains("objenesis|Objenesis - org.objenesis:objenesis"), "Component"] = "Objenesis"
df.loc[df["Component"].str.contains("PostgreSQL JDBC Driver - JDBC 4.2|PostgreSQL JDBC Driver (pgjdbc)|Postgresql JDBC Driver"), "Component"] = "PostgreSQL JDBC Driver"
df.loc[df["Component"] =="mockito", "Component"] = "Mockito"
df.loc[df["Component"].str.contains("JASYPT: Java Simplified Encryption - org.jasypt:jasypt-spring31|JASYPT: Java Simplified Encryption - org.jasypt:jasypt-springsecurity3|jasypt: java simplified encryption|JASYPT: Java Simplified Encryption - org.jasypt:jasypt-spring3"), "Component"] = "JASYPT: Java Simplified Encryption"
df.loc[df["Component"] =="swiper", "Component"] = "Swiper"

df.loc[df["Component"] =="jackson-annotations", "Component"] = "Jackson-annotations"

df.loc[df["Component"] =="junit", "Component"] = "JUnit"
df.loc[df["Component"] =="chart.js", "Component"] = "Chart.js"
df.loc[df["Component"].str.contains("ziparchive (code.google.com/p/ziparchive)|ziparchive"), "Component"] = "ZipArchive"
df.loc[df["Component"] =="Antlr 3 Runtime", "Component"] = "ANTLR 3 Runtime"




In [7]:
df.loc[df["Component"].str.contains("unspecified|No Label"), "Component"] = "Unspecified"

In [8]:
# dual license
df.loc[df["License"] =="GPL 2.0", "License"] = "GNU General Public License v2.0 or later"
df.loc[df["License"] =="LGPL 2.1", "License"] = "GNU Lesser General Public License v2.1 or later"
df.loc[df["License"] =="MIT License V2", "License"] = "MIT License"
df.loc[df["License"] =="Eclipse Public License - v 1.0Alternate License:Common Public License", "License"] = "Eclipse Public License 1.0"
df.loc[df["License"] =="Mozilla Public License 1.1Alternate License:Zimbra Public EULA 2.1 License", "License"] = "Mozilla Public License 1.1"
df.loc[df["License"] =="MIT LicenseAlternate License:GNU Lesser General Public License v2.1 or later", "License"] = "MIT License"
df.loc[df["License"] =="Academic Free License v2.1Alternate License:GNU General Public License v2.0 or later", "License"] = "Academic Free License v2.1"
df.loc[df["License"] =="Free Type Project LicenseAlternate License:GPL 2.0", "License"] = "Free Type Project License"
df.loc[df["License"] =="Microsoft Public LicenseAlternate License:Apache License 2.0", "License"] = "Apache License 2.0"
df.loc[df["License"] =="GNU Lesser General Public License v3.0 or laterAlternate License:GNU General Public License v2.0 or later", "License"] = "GNU General Public License v2.0 or later"

df.loc[df["License"] =="GNU General Public License v2.0 or laterAlternate Licenses:Genivia gSOAP Commercial License, gSoap Public License Version 1.3a License", "License"] = "GNU General Public License v2.0 or later"
df.loc[df["License"] =="ANTLR LicenseAlternate License:Eclipse Public License - v 1.0", "License"] = "ANTLR License"
df.loc[df["License"] =="BSD 2.0Alternate License:GPL 2.0", "License"] = "BSD 2.0"
df.loc[df["License"] =="GPL 2.0Alternate License:BSD 2.0", "License"] = "BSD 2.0"
df.loc[df["License"] =="GPL 2.0Alternate License:GPL 2.0 Only", "License"] = "GNU General Public License v2.0 or later"
df.loc[df["License"] =="JavaMail 1.3.3 LicenseAlternate Licenses:JAVA Research License 1.5, Common Development and Distribution License 1.0", "License"] = "Common Development and Distribution License 1.0"
df.loc[df["License"] =="GNU Lesser General Public License v2.1 or laterAlternate License:Sun Public License v1.0", "License"] = "GNU Lesser General Public License v2.1 or later"
df.loc[df["License"] =="Common Development and Distribution License 1.0Alternate Licenses:Common Development and Distribution License 1.1, Sun GPL With Classpath Exception (GPL+)", "License"] = "Common Development and Distribution License 1.0"
df.loc[df["License"] =="Sun BSD LicenseAlternate License:GNU Lesser General Public License v2.1 or later", "License"] = "Sun BSD License"

df.loc[df["License"] =="PHP  License Version 2.02Alternate License:BSD 2-clause \"Simplified\" License", "License"] = "BSD 2-clause Simplified License"
df.loc[df["License"] =="GNU Lesser General Public License v2.1 or laterAlternate License:Apache License 2.0", "License"] = "Apache License 2.0"
df.loc[df["License"] =="GNU General Public License v3.0 or laterAlternate Licenses:GNU General Public License v2.0 or later, GNU Lesser General Public License v2.1 or later, Public Domain", "License"] = "Public Domain"
df.loc[df["License"] =="LGPL 2.1Alternate License:GPL 2.0", "License"] = "GNU Lesser General Public License v2.1 or later"
df.loc[df["License"] =="MIT License V2Alternate License:LGPL 2.1", "License"] = "MIT License"
df.loc[df["License"] =="GNU Lesser General Public License v3.0 or laterAlternate License:BSD 3-clause \"New\" or \"Revised\" License", "License"] = "BSD 3-clause New or Revised License"
df.loc[df["License"] =="Public DomainAlternate License:MIT License", "License"] = "MIT License"
df.loc[df["License"] =="MIT License V2Alternate License:MIT v2 with Ad Clause License", "License"] = "MIT License"
df.loc[df["License"] =="Public DomainAlternate License:MIT License", "License"] = "MIT License"

df.loc[df["License"] =="W3C Software Notice and LicenseAlternate License:W3C Software 20021231 License", "License"] = "W3C Software Notice and License"
df.loc[df["License"] =="CodeIgniter LicenseAlternate License:Unspecified", "License"] = "CodeIgniter License"
df.loc[df["License"] =="GNU Lesser General Public License v2.1 or laterAlternate Licenses:GNU General Public License v3.0 or later, MIT License", "License"] = "MIT License"
df.loc[df["License"] =="LGPL 2.1Alternate License:LGPL 3.0", "License"] = "GNU Lesser General Public License v2.1 or later"
df.loc[df["License"] =="Creative Commons Attribution 3.0Alternate License:MIT License", "License"] = "MIT License"
df.loc[df["License"] =="GNU General Public License v2.0 or laterAlternate License:Mozilla Public License 2.0", "License"] = "GNU General Public License v2.0 or later"
df.loc[df["License"] =="MIT LicenseAlternate License:Unspecified", "License"] = "MIT License"
df.loc[df["License"] =="Apache License 2.0Alternate License:Unspecified", "License"] = "Apache License 2.0"
df.loc[df["License"] =="MIT LicenseAlternate License:GNU General Public License v2.0 or later", "License"] = "MIT License"

df.loc[df["License"] =="Common Development and Distribution License 1.0Alternate License:Sun GPL With Classpath Exception (GPL+)", "License"] = "Common Development and Distribution License 1.0"
df.loc[df["License"] =="Common Development and Distribution License 1.1Alternate License:Sun GPL With Classpath Exception (GPL+)", "License"] = "Common Development and Distribution License 1.1"
df.loc[df["License"] =="Apple DisclaimerAlternate License:Unspecified", "License"] = "Apple Disclaimer"
df.loc[df["License"] =="GNU Lesser General Public License v2.1 or laterAlternate License:Mozilla Public License 1.1", "License"] = "Mozilla Public License 1.1"
df.loc[df["License"] =="GNU Lesser General Public License v2.1 or laterAlternate License:GNU General Public License v2.0 or later", "License"] = "GNU Lesser General Public License v2.1 or later"
df.loc[df["License"] =="BSD 3-clause \"New\" or \"Revised\" LicenseAlternate License:Unspecified", "License"] = "BSD 3-clause New or Revised License"
df.loc[df["License"] =="Sun JavaBeans Activation Framework 1.1 License for non-redistributablesAlternate License:Sun JavaBeans Activation Framework 1.1 License for Redistributable Code", "License"] = "Sun JavaBeans Activation Framework 1.1 License for Redistributable Code"
df.loc[df["License"] ==" MIT License", "License"] = "MIT License"



df.loc[df["License"] =="GNU Lesser General Public License v2.1 or laterAlternate Licenses:GNU General Public License v2.0 or later, Mozilla Public License 1.1", "License"] = "Mozilla Public License 1.1"
df.loc[df["License"] =="Apache License 2.0Alternate License:GNU Lesser General Public License v2.1 or later", "License"] = "Apache License 2.0"
df.loc[df["License"] =="MIT License V2Alternate License:GPL 2.0", "License"] = "MIT License"
df.loc[df["License"] =="GNU General Public License v2.0 or laterAlternate License:GNU Lesser General Public License v2.1 or later", "License"] = "GNU Lesser General Public License v2.1 or later"
df.loc[df["License"] =="Apache License 2.0Alternate License:Apache License 1.1", "License"] = "Apache License 1.1"
df.loc[df["License"] =="Artistic License 1.0Alternate License:GNU General Public License v2.0 or later", "License"] = "Artistic License 1.0"
df.loc[df["License"] =="Mozilla Public License 1.1Alternate License:GNU Lesser General Public License v2.1 or later", "License"] = "Mozilla Public License 1.1"
df.loc[df["License"] =="MIT License V2Alternate License:Unspecified", "License"] = "MIT License"
df.loc[df["License"] =="GNU General Public License v2.0 or laterAlternate License:MySQL Commercial License", "License"] = "GNU General Public License v2.0 or later"
df.loc[df["License"] =="GNU Lesser General Public License v2.1 or laterAlternate License:Unspecified", "License"] = "GNU Lesser General Public License v2.1 or later"

df.loc[df["License"] =="Sun JavaBeans Activation Framework 1.1.1 License for non-redistributablesAlternate License:Sun JavaBeans Activation Framework 1.1.1 License for Redistributable Code", "License"] = "Sun JavaBeans Activation Framework 1.1.1 License for Redistributable Code"
df.loc[df["License"] =="Apache License Version 2.0Alternate License:Unspecified", "License"] = "Apache License 2.0"
df.loc[df["License"] =="GNU General Public License v2.0 or laterAlternate License:Unspecified", "License"] = "GNU General Public License v2.0 or later"
df.loc[df["License"] =="MIT LicenseAlternate License:BSD 3-clause \"New\" or \"Revised\" License", "License"] = "MIT License"
df.loc[df["License"] =="GNU General Public License v2.0 or laterAlternate License:MIT License", "License"] = "MIT License"
df.loc[df["License"] =="Apache License 2.0Alternate License:University of Washington's Free-Fork License", "License"] = "Apache License 2.0"
df.loc[df["License"] =="LGPL 2.0Alternate License:GPL 2.0", "License"] = "GNU Lesser General Public License v2.1 or later"
df.loc[df["License"] =="Mozilla Public License 1.1Alternate Licenses:GNU General Public License v2.0 or later, GNU Lesser General Public License v2.1 or later", "License"] = "Mozilla Public License 1.1"
df.loc[df["License"] =="Oswego.util.concurrent LicenseAlternate License:Doug Lea Sun License", "License"] = "Doug Lea Sun License"

df.loc[df["License"] =="Common Development and Distribution License 1.0Alternate License:GNU General Public License v2.0 or later", "License"] = "Common Development and Distribution License 1.0"
df.loc[df["License"] =="Apache License 2.0Alternate License:GNU General Public License v2.0 or later", "License"] = "Apache License 2.0"
df.loc[df["License"] =="BSD 3-clause \"New\" or \"Revised\" LicenseAlternate License:BSD 2-clause \"Simplified\" License", "License"] = "BSD 3-clause New or Revised License"
df.loc[df["License"] =="Proprietary LicenseAlternate Licenses:Creative Commons Attribution Non Commercial 3.0, Alternative Commercial License Available", "License"] = "Creative Commons Attribution Non Commercial 3.0"
df.loc[df["License"] =="Common Development and Distribution License 1.1Alternate License:Apache License 2.0", "License"] = "Apache License 2.0"
df.loc[df["License"] =="Less LicenseAlternate License:GNU General Public License v2.0 or later", "License"] = "GNU General Public License v2.0 or later"
df.loc[df["License"] =="Common Development and Distribution License 1.0Alternate License:OpenSolaris Binary License", "License"] = "Common Development and Distribution License 1.0"
df.loc[df["License"] =="UnspecifiedAlternate License:ISC License", "License"] = "ISC License"
df.loc[df["License"] =="MIT LicenseAlternate License:Apache License 2.0", "License"] = "MIT License"

df.loc[df["License"] =="GPL 2.0Alternate License:MIT License V2", "License"] = "MIT License"
df.loc[df["License"] =="Common Development and Distribution License 1.1Alternate License:Sun Community Source License 3.0", "License"] = "Common Development and Distribution License 1.1"
df.loc[df["License"] =="LGPL 3.0Alternate License:LGPL 2.1", "License"] = "GNU Lesser General Public License v2.1 or later"
df.loc[df["License"] =="Apache License Version 2.0Alternate License:GPL 3.0", "License"] = "Apache License 2.0"
df.loc[df["License"] =="Apache License 2.0Alternate License:GNU General Public License v3.0 or later", "License"] = "Apache License 2.0"
df.loc[df["License"] =="COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.0Alternate License:Sun JavaMail 1.4 License", "License"] = "Common Development and Distribution License 1.0"
df.loc[df["License"] =="Sun JavaBeans Activation Framework 1.1 License for Redistributable CodeAlternate License:Sun JavaBeans Activation Framework 1.1 License for non-redistributables", "License"] = "Sun JavaBeans Activation Framework 1.1 License for Redistributable Code"
df.loc[df["License"] =="GNU Lesser General Public License v3.0 or laterAlternate Licenses:GNU General Public License v3.0 or later, MIT License", "License"] = "MIT License"
df.loc[df["License"] =="GNU General Public License v3.0 or laterAlternate License:Ext Exception to GPL 3.0 for Applications", "License"] = "GNU General Public License v3.0 or later"

df.loc[df["License"] =="Historic Permission Notice and DisclaimerAlternate License:CNRI Python License", "License"] = "CNRI Python License"
df.loc[df["License"] =="GNU Lesser General Public License v3.0 or laterAlternate License:GNU Lesser General Public License v2.1 or later", "License"] = "GNU Lesser General Public License v2.1 or later"
df.loc[df["License"] =="MIT LicenseAlternate License:Apache License 2.0", "License"] = "MIT License"
df.loc[df["License"] =="GNU General Public License v1.0 or laterAlternate License:Artistic License 1.0", "License"] = "Artistic License 1.0"
df.loc[df["License"] =="GNU General Public License v3.0 or laterAlternate License:GNU Lesser General Public License v2.1 or later", "License"] = "GNU Lesser General Public License v2.1 or later"
df.loc[df["License"] =="MIT LicenseAlternate License:cocos2d for iPhone License", "License"] = "MIT License"
df.loc[df["License"] =="BSD 2-clause \"Simplified\" LicenseAlternate License:Unspecified", "License"] = "BSD 2-clause Simplified License"
df.loc[df["License"] =="GNU Library General Public License v2 or laterAlternate License:Unspecified", "License"] = "GNU Library General Public License v2.0 or later"
df.loc[df["License"] =="Apache License 2.0Alternate License:Common Development and Distribution License 1.0", "License"] = "Apache License 2.0"

df.loc[df["License"] =="Apache License 2.0Alternate License:Common Development and Distribution License 1.1", "License"] = "Apache License 2.0"
df.loc[df["License"] =="Google Android SDK LicenseAlternate License:Google Android SDK 4-2009 License", "License"] = "Google Android SDK License"
df.loc[df["License"] =="GPL 3.0Alternate License:Unspecified", "License"] = "GNU General Public License v3.0 or later"
df.loc[df["License"] =="Creative Commons Attribution-NonCommercial-ShareAlike 3.0 LicenseAlternate License:Unspecified", "License"] = "Creative Commons Attribution-NonCommercial-ShareAlike 3.0 License"
df.loc[df["License"] =="zlib LicenseAlternate License:Unspecified", "License"] = "zlib License"
df.loc[df["License"] =="BSD 3-clause \"New\" or \"Revised\" LicenseAlternate License:ASM License", "License"] = "BSD 3-clause New or Revised License"
df.loc[df["License"] =="PostgreSQL LicenseAlternate License:Unspecified", "License"] = "PostgreSQL License"
df.loc[df["License"] =="MIT License V2Alternate License:ICU Licensed", "License"] = "MIT License"
df.loc[df["License"] =="Oracle Technology Network Development and Distribution LicenseAlternate License:Oracle Technology Network Development and Distribution 01-2008 License", "License"] = "Oracle Technology Network Development and Distribution License"

df.loc[df["License"] =="GNU General Public License v2.0 or laterAlternate License:Intel Binary Program License", "License"] = "GNU General Public License v2.0 or later"
df.loc[df["License"] =="Common Development and Distribution License 1.1Alternate License:GNU General Public License v3.0 or later", "License"] = "Common Development and Distribution License 1.1"
df.loc[df["License"] =="LGPL 2.1Alternate License:zlib/libpng License", "License"] = "zlib License"
df.loc[df["License"] =="BSD 2.0Alternate License:Unspecified", "License"] = "BSD 2-clause Simplified License"
df.loc[df["License"] =="Apache License 2.0Alternate Licenses:GNU General Public License v2.0 or later, Artistic License 1.0", "License"] = "Apache License 2.0"
df.loc[df["License"] =="GNU Library General Public License v2 or laterAlternate License:GNU General Public License v2.0 or later", "License"] = "GNU Library General Public License v2.0 or later"
df.loc[df["License"] =="PHP  License Version 2.01Alternate License:PHP LIcense v3.01", "License"] = "PHP License 2.01"
df.loc[df["License"] =="Common Development and Distribution License 1.0Alternate Licenses:GNU General Public License v2.0 w/Classpath exception, Common Development and Distribution License 1.1", "License"] = "Common Development and Distribution License 1.0"
df.loc[df["License"] =="Creative Commons Attribution Non Commercial Share Alike 2.5Alternate License:GNU General Public License v3.0 or later", "License"] = "GNU General Public License v3.0 or later"




df.loc[df["License"] =="Microsoft JDBC Driver 3.0 for SQL Server LicenseAlternate License:Microsoft JDBC Driver 4.0 for SQL Server CTP Pre-Release License", "License"] = "Microsoft JDBC Driver 3.0 for SQL Server License"
df.loc[df["License"] =="GNU Lesser General Public License v2.1 or laterAlternate License:GNU General Public License v3.0 or later", "License"] = "GNU Lesser General Public License v2.1 or later"
df.loc[df["License"] =="Google Android SDK LicenseAlternate License:Unspecified", "License"] = "Google Android SDK License"
df.loc[df["License"] =="Apache License 2.0Alternate License:Libjpeg License (JPEG License)", "License"] = "Apache License 2.0"
df.loc[df["License"] =="BSD 2.0Alternate License:PCRE 5 License", "License"] = "BSD 2-clause Simplified License"
df.loc[df["License"] =="BSD 2-clause \"Simplified\" LicenseAlternate License:BSD 3-clause \"New\" or \"Revised\" License", "License"] = "BSD 2-clause Simplified License"
df.loc[df["License"] =="Mozilla Public License 1.1Alternate Licenses:LGPL 2.1, Apache License Version 2.0", "License"] = "Apache License 2.0"
df.loc[df["License"] =="Mozilla Public License 1.1Alternate Licenses:LGPL 2.1, Apache License Version 2.0", "License"] = "Apache License 2.0"
df.loc[df["License"] =="GNU Lesser General Public License v2.1 or laterAlternate License:MIT License", "License"] = "MIT License"

df.loc[df["License"] =="GNU General Public License v2.0 or laterAlternate License:Alternative Commercial License Available", "License"] = "GNU General Public License v2.0 or later"
df.loc[df["License"] =="Apache License 2.0Alternate License:Open Symphony 1.1 License ", "License"] = "Apache License 2.0"
df.loc[df["License"] =="MIT License V2Alternate License:Unspecified", "License"] = "MIT License"
df.loc[df["License"] =="GNU General Public License v2.0 or laterAlternate License:MySQL Commercial License", "License"] = "GNU General Public License v2.0 or later"
df.loc[df["License"] =="GNU Lesser General Public License v2.1 or laterAlternate License:Unspecified", "License"] = "GNU Lesser General Public License v2.1 or later"
df.loc[df["License"] =="Sun JavaBeans Activation Framework 1.1.1 License for non-redistributablesAlternate License:Sun JavaBeans Activation Framework 1.1.1 License for Redistributable Code", "License"] = "Sun JavaBeans Activation Framework 1.1.1 License for Redistributable Code"
df.loc[df["License"] =="Apache License Version 2.0Alternate License:Unspecified", "License"] = "Apache License 2.0"
df.loc[df["License"] =="GNU General Public License v2.0 or laterAlternate License:Unspecified", "License"] = "GNU General Public License v2.0 or later"
df.loc[df["License"] =="MIT LicenseAlternate License:BSD 3-clause \"New\" or \"Revised\" License", "License"] = "MIT License"
df.loc[df["License"] =="GNU General Public License v2.0 or laterAlternate License:MIT License", "License"] = "MIT License"
df.loc[df["License"] =="Apache License 2.0Alternate License:University of Washington's Free-Fork License", "License"] = "Apache License 2.0"

df.loc[df["License"] =="LGPL 2.0Alternate License:GPL 2.0", "License"] = "GNU Lesser General Public License v2.1 or later"
df.loc[df["License"] =="Mozilla Public License 1.1Alternate Licenses:GNU General Public License v2.0 or later, GNU Lesser General Public License v2.1 or later", "License"] = "Mozilla Public License 1.1"
df.loc[df["License"] =="Oswego.util.concurrent LicenseAlternate License:Doug Lea Sun License", "License"] = "Doug Lea Sun License"
df.loc[df["License"] =="Common Development and Distribution License 1.0Alternate License:GNU General Public License v2.0 or later", "License"] = "Common Development and Distribution License 1.0"
df.loc[df["License"] =="Apache License 2.0Alternate License:GNU General Public License v2.0 or later", "License"] = "Apache License 2.0"
df.loc[df["License"] =="BSD 3-clause \"New\" or \"Revised\" LicenseAlternate License:BSD 2-clause \"Simplified\" License", "License"] = "BSD 3-clause New or Revised License"
df.loc[df["License"] =="Proprietary LicenseAlternate Licenses:Creative Commons Attribution Non Commercial 3.0, Alternative Commercial License Available", "License"] = "Creative Commons Attribution Non Commercial 3.0"
df.loc[df["License"] =="Common Development and Distribution License 1.1Alternate License:Apache License 2.0", "License"] = "Apache License 2.0"
df.loc[df["License"] =="Less LicenseAlternate License:GNU General Public License v2.0 or later", "License"] = "GNU General Public License v2.0 or later"

df.loc[df["License"] =="Common Development and Distribution License 1.0Alternate License:OpenSolaris Binary License", "License"] = "Common Development and Distribution License 1.0"
df.loc[df["License"] =="UnspecifiedAlternate License:ISC License", "License"] = "ISC License"
df.loc[df["License"] =="MIT LicenseAlternate License:Apache License 2.0", "License"] = "MIT License"
df.loc[df["License"] =="GPL 2.0Alternate License:MIT License V2", "License"] = "MIT License"
df.loc[df["License"] =="Common Development and Distribution License 1.1Alternate License:Sun Community Source License 3.0", "License"] = "Common Development and Distribution License 1.1"
df.loc[df["License"] =="LGPL 3.0Alternate License:LGPL 2.1", "License"] = "GNU Lesser General Public License v2.1 or later"
df.loc[df["License"] =="Apache License Version 2.0Alternate License:GPL 3.0", "License"] = "Apache License 2.0"
df.loc[df["License"] =="Apache License 2.0Alternate License:GNU General Public License v3.0 or later", "License"] = "Apache License 2.0"
df.loc[df["License"] =="COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.0Alternate License:Sun JavaMail 1.4 License", "License"] = "Common Development and Distribution License 1.0"
df.loc[df["License"] =="Sun JavaBeans Activation Framework 1.1 License for Redistributable CodeAlternate License:Sun JavaBeans Activation Framework 1.1 License for non-redistributables", "License"] = "Sun JavaBeans Activation Framework 1.1 License for Redistributable Code"

df.loc[df["License"] =="GNU Lesser General Public License v3.0 or laterAlternate Licenses:GNU General Public License v3.0 or later, MIT License", "License"] = "MIT License"
df.loc[df["License"] =="GNU General Public License v3.0 or laterAlternate License:Ext Exception to GPL 3.0 for Applications", "License"] = "GNU General Public License v3.0 or later"
df.loc[df["License"] =="Historic Permission Notice and DisclaimerAlternate License:CNRI Python License", "License"] = "CNRI Python License"
df.loc[df["License"] =="GNU Lesser General Public License v3.0 or laterAlternate License:GNU Lesser General Public License v2.1 or later", "License"] = "GNU Lesser General Public License v2.1 or later"
df.loc[df["License"] =="MIT LicenseAlternate License:Apache License 2.0", "License"] = "MIT License"
df.loc[df["License"] =="GNU General Public License v1.0 or laterAlternate License:Artistic License 1.0", "License"] = "Artistic License 1.0"
df.loc[df["License"] =="GNU General Public License v3.0 or laterAlternate License:GNU Lesser General Public License v2.1 or later", "License"] = "GNU Lesser General Public License v2.1 or later"
df.loc[df["License"] =="MIT LicenseAlternate License:cocos2d for iPhone License", "License"] = "MIT License"

df.loc[df["License"] =="BSD 2-clause \"Simplified\" LicenseAlternate License:Unspecified", "License"] = "BSD 2-clause Simplified License"
df.loc[df["License"] =="GNU Library General Public License v2 or laterAlternate License:Unspecified", "License"] = "GNU Library General Public License v2.0 or later"
df.loc[df["License"] =="Apache License 2.0Alternate License:Common Development and Distribution License 1.0", "License"] = "Apache License 2.0"
df.loc[df["License"] =="Apache License 2.0Alternate License:Common Development and Distribution License 1.1", "License"] = "Apache License 2.0"
df.loc[df["License"] =="Google Android SDK LicenseAlternate License:Google Android SDK 4-2009 License", "License"] = "Google Android SDK License"
df.loc[df["License"] =="GPL 3.0Alternate License:Unspecified", "License"] = "GNU General Public License v3.0 or later"
df.loc[df["License"] =="Creative Commons Attribution-NonCommercial-ShareAlike 3.0 LicenseAlternate License:Unspecified", "License"] = "Creative Commons Attribution-NonCommercial-ShareAlike 3.0 License"
df.loc[df["License"] =="zlib LicenseAlternate License:Unspecified", "License"] = "zlib License"
df.loc[df["License"] =="BSD 3-clause \"New\" or \"Revised\" LicenseAlternate License:ASM License", "License"] = "BSD 3-clause New or Revised License"
df.loc[df["License"] =="PostgreSQL LicenseAlternate License:Unspecified", "License"] = "PostgreSQL License"

df.loc[df["License"] =="MIT License V2Alternate License:ICU Licensed", "License"] = "MIT License"
df.loc[df["License"] =="Oracle Technology Network Development and Distribution LicenseAlternate License:Oracle Technology Network Development and Distribution 01-2008 License", "License"] = "Oracle Technology Network Development and Distribution License"
df.loc[df["License"] =="GNU General Public License v2.0 or laterAlternate License:Intel Binary Program License", "License"] = "GNU General Public License v2.0 or later"
df.loc[df["License"] =="Common Development and Distribution License 1.1Alternate License:GNU General Public License v3.0 or later", "License"] = "Common Development and Distribution License 1.1"
df.loc[df["License"] =="LGPL 2.1Alternate License:zlib/libpng License", "License"] = "zlib License"
df.loc[df["License"] =="BSD 2.0Alternate License:Unspecified", "License"] = "BSD 2-clause Simplified License"
df.loc[df["License"] =="Apache License 2.0Alternate Licenses:GNU General Public License v2.0 or later, Artistic License 1.0", "License"] = "Apache License 2.0"
df.loc[df["License"] =="GNU Library General Public License v2 or laterAlternate License:GNU General Public License v2.0 or later", "License"] = "GNU Library General Public License v2.0 or later"
df.loc[df["License"] =="PHP  License Version 2.01Alternate License:PHP LIcense v3.01", "License"] = "PHP License 2.01"

df.loc[df["License"] =="Common Development and Distribution License 1.0Alternate Licenses:GNU General Public License v2.0 w/Classpath exception, Common Development and Distribution License 1.1", "License"] = "Common Development and Distribution License 1.0"
df.loc[df["License"] =="Creative Commons Attribution Non Commercial Share Alike 2.5Alternate License:GNU General Public License v3.0 or later", "License"] = "GNU General Public License v3.0 or later"
df.loc[df["License"] =="Microsoft JDBC Driver 3.0 for SQL Server LicenseAlternate License:Microsoft JDBC Driver 4.0 for SQL Server CTP Pre-Release License", "License"] = "Microsoft JDBC Driver 3.0 for SQL Server License"
df.loc[df["License"] =="GNU Lesser General Public License v2.1 or laterAlternate License:GNU General Public License v3.0 or later", "License"] = "GNU Lesser General Public License v2.1 or later"
df.loc[df["License"] =="Google Android SDK LicenseAlternate License:Unspecified", "License"] = "Google Android SDK License"
df.loc[df["License"] =="Apache License 2.0Alternate License:Libjpeg License (JPEG License)", "License"] = "Apache License 2.0"
df.loc[df["License"] =="BSD 2.0Alternate License:PCRE 5 License", "License"] = "BSD 2-clause Simplified License"
df.loc[df["License"] =="BSD 2-clause \"Simplified\" LicenseAlternate License:BSD 3-clause \"New\" or \"Revised\" License", "License"] = "BSD 2-clause Simplified License"
df.loc[df["License"] =="Mozilla Public License 1.1Alternate Licenses:LGPL 2.1, Apache License Version 2.0", "License"] = "Apache License 2.0"
df.loc[df["License"] =="Mozilla Public License 1.1Alternate Licenses:LGPL 2.1, Apache License Version 2.0", "License"] = "Apache License 2.0"

df.loc[df["License"] =="GNU Lesser General Public License v2.1 or laterAlternate License:MIT License", "License"] = "MIT License"
df.loc[df["License"] =="GNU General Public License v2.0 or laterAlternate License:Alternative Commercial License Available", "License"] = "GNU General Public License v2.0 or later"
df.loc[df["License"] =="Apache License 2.0Alternate License:Open Symphony 1.1 License ", "License"] = "Apache License 2.0"
df.loc[df["License"] =="GNU Lesser General Public License v2.1 or laterAlternate License:Eclipse Public License 1.0", "License"] = "GNU Lesser General Public License v2.1 or later"
df.loc[df["License"] =="GNU General Public License v3.0 or laterAlternate License:Unspecified", "License"] = "GNU General Public License v3.0 or later"
df.loc[df["License"] =="Free Type Project LicenseAlternate License:GNU General Public License v2.0 or later", "License"] = "GNU General Public License v2.0 or later"
df.loc[df["License"] =="zlib LicenseAlternate License:GNU Lesser General Public License v2.1 or later", "License"] = "zlib License"
df.loc[df["License"] =="GPL 2.0Alternate Licenses:Genivia gSOAP Commercial License, gSOAP Public 1.3b License", "License"] = "GNU General Public License v2.0 or later"
df.loc[df["License"] =="Open Software License 3.0Alternate License:GNU General Public License v3.0 or later", "License"] = "GNU General Public License v3.0 or later"
df.loc[df["License"] =="ICU LicenseAlternate License:Unspecified", "License"] = "ICU License"
df.loc[df["License"] =="BSD 2.0Alternate License:Global IP Sound 2.0 License", "License"] = "BSD 2-clause Simplified License"

df.loc[df["License"] =="Creative Commons Attribution 3.0Alternate License:Unspecified", "License"] = "Creative Commons Attribution 3.0"
df.loc[df["License"] =="GNU General Public License v2.0 or laterAlternate License:Q Public License 1.0", "License"] = "GNU General Public License v2.0 or later"
df.loc[df["License"] =="MIT License V2Alternate License:libxml2 License", "License"] = "MIT License"
df.loc[df["License"] =="MIT License V2Alternate License:ICU License", "License"] = "MIT License"
df.loc[df["License"] =="PHP License v3.0Alternate License:PHP LIcense v3.01", "License"] = "PHP License v3.0"
df.loc[df["License"] =="Apache License 2.0Alternate License:Open Symphony 1.1 License", "License"] = "Apache License 2.0"
df.loc[df["License"] =="BSD 3-clause \"New\" or \"Revised\" LicenseAlternate License:MIT License", "License"] = "MIT License"
df.loc[df["License"] =="ICU LicenseAlternate License:Unspecified", "License"] = "ICU License"
df.loc[df["License"] =="Apache License 2.0Alternate Licenses:Mozilla Public License 1.1, GNU Lesser General Public License v2.1 or later", "License"] = "Apache License 2.0"

df.loc[df["License"] =="Artistic License 1.0Alternate License:Unspecified", "License"] = "Artistic License 1.0"
df.loc[df["License"] =="NASA Open Source Agreement 1.3Alternate License:Unspecified", "License"] = "NASA Open Source Agreement 1.3"
df.loc[df["License"] =="Sun GPL With Classpath Exception (GPL+)Alternate License:Common Development and Distribution License 1.1", "License"] = "Common Development and Distribution License 1.1"
df.loc[df["License"] =="BSD 3-clause \"New\" or \"Revised\" LicenseAlternate License:Apache License 2.0", "License"] = "Apache License 2.0"
df.loc[df["License"] =="GPL 2.0Alternate License:LGPL 2.1", "License"] = "GNU Lesser General Public License v2.1 or later"
df.loc[df["License"] =="Apache License 2.0Alternate License:BSD 3-clause \"New\" or \"Revised\" License", "License"] = "BSD 3-clause New or Revised License"
df.loc[df["License"] =="LGPL 2.1Alternate License:Unspecified", "License"] = "GNU Lesser General Public License v2.1 or later"
df.loc[df["License"] =="Public DomainAlternate License:Unspecified", "License"] = "Public Domain"
df.loc[df["License"] =="Mozilla Public License 1.1Alternate License:GNU General Public License v2.0 or later", "License"] = "Mozilla Public License 1.1"
df.loc[df["License"] =="Apache License 2.0Alternate License:Eclipse Public License 1.0", "License"] = "Apache License 2.0"

df.loc[df["License"] =="Code Project Open License Alternate License:Unspecified", "License"] = "Code Project Open License"
df.loc[df["License"] =="MIT LicenseAlternate License:GNU General Public License v3.0 or later", "License"] = "MIT License"
df.loc[df["License"] =="Proprietary LicenseAlternate License:[template] Basic Proprietary Commercial License", "License"] = "Proprietary License"
df.loc[df["License"] =="Proprietary LicenseAlternate License:Unspecified", "License"] = "Proprietary License"
df.loc[df["License"] =="Common Development and Distribution License 1.1Alternate License:Unspecified", "License"] = "Common Development and Distribution License 1.1"
df.loc[df["License"] =="GNU General Public License v2.0 or laterAlternate License:Artistic License 1.0", "License"] = "Artistic License 1.0"
df.loc[df["License"] =="Apache License Version 2.0Alternate License:Apache 1.1", "License"] = "Apache License 2.0"
df.loc[df["License"] =="Common Development and Distribution License 1.0Alternate License:GNU Lesser General Public License v2.1 or later", "License"] = "Common Development and Distribution License 1.0"
df.loc[df["License"] =="GPL 2.0Alternate Licenses:Genivia gSOAP Commercial License, gSoap Public License Version 1.3a License", "License"] = "MIT License"
df.loc[df["License"] =="MIT License V2Alternate License:libxml2 License", "License"] = "GNU General Public License v2.0 or later"

df.loc[df["License"] =="Java Advanced Imaging Distribution License (JDL) 1.1.xAlternate License:JAVA Research License 1.5", "License"] = "JAVA Research License 1.5"
df.loc[df["License"] =="GNU General Public License v2.0 or laterAlternate License:GNU General Public License v2.0 only", "License"] = "GNU General Public License v2.0 or later"
df.loc[df["License"] =="Common Development and Distribution License 1.0Alternate License:Sun JavaMail 1.4 License", "License"] = "Common Development and Distribution License 1.0"
df.loc[df["License"] =="Common Development and Distribution License 1.0Alternate License:Unspecified", "License"] = "Common Development and Distribution License 1.0"
df.loc[df["License"] =="Academic Free License v2.1Alternate License:BSD 3-clause \"New\" or \"Revised\" License", "License"] = "BSD 3-clause New or Revised License"
df.loc[df["License"] =="Alternative Commercial License AvailableAlternate License:Creative Commons Attribution Non Commercial 3.0", "License"] = "Creative Commons Attribution Non Commercial 3.0"
df.loc[df["License"] =="Eclipse Public License - v 1.0Alternate License:Unspecified", "License"] = "Eclipse Public License 1.0"
df.loc[df["License"] =="GNU General Public License v3.0 or laterAlternate License:GNU General Public License v2.0 or later", "License"] = "GNU General Public License v2.0 or later"
df.loc[df["License"] =="MySQL Commercial LicenseAlternate License:GNU General Public License v2.0 or later", "License"] = "GNU General Public License v2.0 or later"
df.loc[df["License"] =="MIT License V2Alternate License:libxml2 License", "License"] = "MIT License"

df.loc[df["License"] =="Common Development and Distribution License 1.1Alternate License:GNU General Public License v2.0 w/Classpath exception", "License"] = "Common Development and Distribution License 1.1"
df.loc[df["License"] =="COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.0Alternate License:Sun GPL With Classpath Exception (GPL+)", "License"] = "Common Development and Distribution License 1.0"
df.loc[df["License"] =="COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.0Alternate License:Unspecified", "License"] = "Common Development and Distribution License 1.0"
df.loc[df["License"] =="Apache License 2.0Alternate License:MIT License", "License"] = "MIT License"
df.loc[df["License"] =="Apache License 1.1Alternate License:Apache License 2.0", "License"] = "Apache License 1.1"
df.loc[df["License"] =="Apache License Version 2.0Alternate License:LGPL 2.1", "License"] = "Apache License 2.0"
df.loc[df["License"] =="BSD 3-clause \"New\" or \"Revised\" LicenseAlternate License:GNU General Public License v2.0 or later", "License"] = "BSD 3-clause New or Revised License"
df.loc[df["License"] =="Perl Artistic LicenseAlternate License:GPL 2.0", "License"] = "Perl Artistic License"
df.loc[df["License"] =="GNU General Public License v2.0 or laterAlternate License:Sun GPL With Classpath Exception (GPL+)", "License"] = "GNU General Public License v2.0 or later"
df.loc[df["License"] =="Code Project Open LicenseAlternate License:Unspecified", "License"] = "Code Project Open License"

df.loc[df["License"] =="Mozilla Public License 1.1Alternate Licenses:GNU Lesser General Public License v2.1 or later, GNU General Public License v2.0 or later", "License"] = "Mozilla Public License 1.1"
df.loc[df["License"] =="BSD 3-clause \"New\" or \"Revised\" LicenseAlternate License:Academic Free License v2.1", "License"] = "BSD 3-clause New or Revised License"
df.loc[df["License"] =="GNU General Public License v2.0 or laterAlternate License:GNU General Public License v3.0 or later", "License"] = "GNU General Public License v2.0 or later"
df.loc[df["License"] =="Common Development and Distribution License 1.0Alternate Licenses:Sun GPL With Classpath Exception (GPL+), Common Development and Distribution License 1.1", "License"] = "Common Development and Distribution License 1.0"
df.loc[df["License"] =="MIT LicenseAlternate License:FusionCharts Free License", "License"] = "MIT License"
df.loc[df["License"] =="Mozilla Public License 1.1Alternate License:Unspecified", "License"] = "Mozilla Public License 1.1"
df.loc[df["License"] =="GNU Lesser General Public License v3.0 or laterAlternate License:Unspecified", "License"] = "GNU Lesser General Public License v3.0 or later"
df.loc[df["License"] =="UnspecifiedAlternate License:AmCharts Linkware License", "License"] = "AmCharts Linkware License"
df.loc[df["License"] =="Open Software License 2.1Alternate License:Unspecified", "License"] = "Open Software License 2.1"

df.loc[df["License"] =="GPL 2.0Alternate License:OpenVPN GPL 2 Only with Exception License", "License"] = "GNU General Public License v2.0 or later"
df.loc[df["License"] =="GPL 2.0Alternate License:GPL 3.0", "License"] = "GNU General Public License v2.0 or later"
df.loc[df["License"] =="GPL 3.0Alternate License:LGPL 2.1", "License"] = "GNU Lesser General Public License v2.1 or later"
df.loc[df["License"] =="Creative Commons Attribution 2.5Alternate License:Unspecified", "License"] = "Creative Commons Attribution 2.5"

df.loc[df["License"] =="CDDL-1.1 AND GPL-2.0-only WITH Classpath-exception-2.0", "License"] = "Common Development and Distribution License 1.1"
df.loc[df["License"] =="CDDL-1.0 AND GPL-2.0-only WITH Classpath-exception-2.0", "License"] = "Common Development and Distribution License 1.0"

In [9]:
# overlapped license name

df.loc[df["License"] =="BSD 3-clause \"New\" or \"Revised\" License", "License"] = "BSD 3-clause New or Revised License"
df.loc[df["License"] =="BSD 3-clause \'New\' or \'Revised\' License", "License"] = "BSD 3-clause New or Revised License"
df.loc[df["License"] =="BSD 3-Clause \'New\' or \'Revised\' License", "License"] = "BSD 3-clause New or Revised License"

df.loc[df["License"] =="BSD 3-Clause \"New\" or \"Revised\" License", "License"] = "BSD 3-clause New or Revised License"
df.loc[df["License"] ==" BSD 3-clause \"New\" or \"Revised\" License", "License"] = "BSD 3-clause New or Revised License"

df.loc[df["License"] =="BSD 3-Clause Clear License", "License"] = "BSD 3-clause Clear License"
df.loc[df["License"] =="BSD License", "License"] = "BSD 3-clause Clear License"


df.loc[df["License"] =="BSD", "License"] = "BSD 3-clause New or Revised License"
df.loc[df["License"] =="BSD-unspecified", "License"] = "BSD 3-clause New or Revised License"


df.loc[df["License"] =="BSD 2-clause \"Simplified\" License", "License"] = "BSD 2-clause Simplified License"
df.loc[df["License"] =="BSD 2-Clause \"Simplified\" License", "License"] = "BSD 2-clause Simplified License"
df.loc[df["License"] =="BSD 2-clause \'Simplified\' License", "License"] = "BSD 2-clause Simplified License"
df.loc[df["License"] ==" BSD 2-clause \"Simplified\" License", "License"] = "BSD 2-clause Simplified License"


df.loc[df["License"] =="BSD 2.0", "License"] = "BSD 2-clause Simplified License"

df.loc[df["License"] =="BSD 2-Clause NetBSD License", "License"] = "BSD 2-clause NetBSD License"


df.loc[df["License"] =="BSD 1.0", "License"] = "BSD 1-clause License"
df.loc[df["License"] =="BSD One Clause License", "License"] = "BSD 1-clause License"



df.loc[df["License"] =="BSD 4-clause \"Original\" or \"Old\" License", "License"] = "BSD 4-clause Original or Old License"
df.loc[df["License"] =="BSD 4-clause 'Original' or 'Old' License", "License"] = "BSD 4-clause Original or Old License"


df.loc[df["License"] =="GPL", "License"] = "GNU Lesser General Public License v2.0 or later"
df.loc[df["License"] ==" GNU General Public License v2.0 or later", "License"] = "GNU Lesser General Public License v2.0 or later"


df.loc[df["License"] =="GPL 2.0 Only", "License"] = "GNU General Public License v2.0 only"
df.loc[df["License"] =="GNU Library General Public License v2 or later", "License"] = "GNU Library General Public License v2.0 or later"
df.loc[df["License"] ==" GNU Lesser General Public License v2.1 or later", "License"] = "GNU Lesser General Public License v2.1 or later"
df.loc[df["License"] =="LGPL 2.0", "License"] = "GNU Lesser General Public License v2.1 or later"

df.loc[df["License"] ==" Sun GPL With Classpath Exception v2.0", "License"] = "Sun GPL With Classpath Exception v2.0"
df.loc[df["License"] ==" GNU General Public License v2.0 w/Classpath exception", "License"] = "GNU General Public License v2.0 w/Classpath exception"

df.loc[df["License"] ==" Sun JavaBeans Activation Framework 1.1 License for non-redistributables", "License"] = "Sun JavaBeans Activation Framework 1.1 License for non-redistributables"



df.loc[df["License"] =="GPL 3.0", "License"] = "GNU General Public License v3.0 or later"
df.loc[df["License"] =="LGPL 3.0", "License"] = "GNU Lesser General Public License v3.0 or later"
df.loc[df["License"] =="[o] GNU Lesser General Public License v3.0 or later", "License"] = "GNU Lesser General Public License v3.0 or later"

df.loc[df["License"] =="GNU Affero General Public License v3.0 only", "License"] = "GNU Affero General Public License v3.0"
df.loc[df["License"] =="GNU Affero General Public License 3.0", "License"] = "GNU Affero General Public License v3.0"




df.loc[df["License"] =="Alternative Commercial License Available", "License"] = "Proprietary License"
df.loc[df["License"] =="Basic Proprietary Commercial License", "License"] = "Proprietary License"
df.loc[df["License"] =="상용 라이선스", "License"] = "Proprietary License"
df.loc[df["License"] =="ETRI 기술이전 Commercial License", "License"] = "Proprietary License"
df.loc[df["License"] =="SK Holdings License", "License"] = "Proprietary License"


df.loc[df["License"] =="[template] Basic Proprietary Commercial License", "License"] = "Proprietary License"
df.loc[df["License"] =="Purchased_Proprietary_Licensed", "License"] = "Proprietary License"


df.loc[df["License"] ==" Apache License 2.0", "License"] = "Apache License 2.0"
df.loc[df["License"] =="Apache License Version 2.0", "License"] = "Apache License 2.0"
df.loc[df["License"] =="Apache 1.1", "License"] = "Apache License 1.1"

df.loc[df["License"] ==" Eclipse Public License 1.0", "License"] = "Eclipse Public License 1.0"
df.loc[df["License"] =="Eclipse Public License - v 1.0", "License"] = "Eclipse Public License 1.0"

df.loc[df["License"] ==" Mozilla Public License 1.1", "License"] = "Mozilla Public License 1.1"


df.loc[df["License"] =="Eclipse Distribution License - v 1.0", "License"] = "Eclipse Distribution License 1.0"
df.loc[df["License"] =="Common Public License", "License"] = "Common Public License 1.0"
df.loc[df["License"] =="Common Development and Distribution License 1.0 ", "License"] = "Common Development and Distribution License 1.0"
df.loc[df["License"] =="COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.0", "License"] = "Common Development and Distribution License 1.0"
df.loc[df["License"] =="Common Development GNU Library General Public License v2 or laterand Distribution (CDDL) 1.1 License", "License"] = "Common Development and Distribution License 1.1"




df.loc[df["License"] =="CC-BY", "License"] = "Creative Commons Attribution 1.0"

df.loc[df["License"] =="CodeProjectOpenLicense", "License"] = "Code Project Open License"
df.loc[df["License"] =="Code Project Open License ", "License"] = "Code Project Open License"


df.loc[df["License"] ==" dom4j License (BSD 2.0 +)", "License"] = "Dom4j License"
df.loc[df["License"] =="dom4j License (BSD 2.0 +)", "License"] = "Dom4j License"

df.loc[df["License"] =="Unspecified?", "License"] = "Unspecified"

df.loc[df["License"] ==" JSON License", "License"] = "JSON License"
df.loc[df["License"] =="PublicDomain", "License"] = "Public Domain"
df.loc[df["License"] ==" ANTLR Software Rights Notice", "License"] = "ANTLR Software Rights Notice"
df.loc[df["License"] =="AFL", "License"] = "Academic Free License 2.0"

df.loc[df["License"] =="curl License", "License"] = "Curl License"
df.loc[df["License"] =="JSon License", "License"] = "JSON License"
df.loc[df["License"] =="Regex License", "License"] = "RegEx License"
df.loc[df["License"] =="Jtidy License", "License"] = "JTidy License"
df.loc[df["License"] =="RealNetworks Public Source License V1.0", "License"] = "RealNetworks Public Source License v1.0"
df.loc[df["License"] =="dTree License", "License"] = "Dtree License"
df.loc[df["License"] =="Microsoft Public License (Ms-PL)", "License"] = "Microsoft Public License"







In [10]:
df.loc[df["Component"] == "Google Gson library","License"] = "Apache License 2.0"
df.loc[df["Component"] == "original_general_use","License"] = "Unspecified"
df.loc[df["Component"] == "Unspecified","License"] = "unknown"

In [11]:
# Usage
df.loc[df["Usage"] =="component", "Usage"] = "Component"
df.loc[df["Usage"] =="[component_separate_work]", "Usage"] = "Component"
df.loc[df["Usage"] =="Component (Separate Work)", "Usage"] = "Component"
df.loc[df["Usage"] =="Full, Partial", "Usage"] = "Component"
df.loc[df["Usage"] =="Partial, Full", "Usage"] = "Component"
df.loc[df["Usage"] =="Snippet (+ File) (Declared)", "Usage"] = "Component"
df.loc[df["Usage"] =="Snippet (+ File, Component)", "Usage"] = "Component"
df.loc[df["Usage"] =="Snippet (+ File)", "Usage"] = "Component"
df.loc[df["Usage"] =="Separate Work", "Usage"] = "Component"
df.loc[df["Usage"] =="Snippet (+ Component)", "Usage"] = "Component"
df.loc[df["Usage"] =="Snippet + Component", "Usage"] = "Component"
df.loc[df["Usage"] =="Non-distribution Code", "Usage"] = "Component"



df.loc[df["Usage"] =="[development_tool]", "Usage"] = "File"
df.loc[df["Usage"] =="DEVELOPMENT_TOOL", "Usage"] = "File"

df.loc[df["Usage"] =="[component_dynamic_library]", "Usage"] = "File"
df.loc[df["Usage"] =="Component (Dynamic Library)", "Usage"] = "File"
df.loc[df["Usage"] =="Component (+ Dynamic Library (per LGPL))", "Usage"] = "File"
df.loc[df["Usage"] =="Snippet (+ File, Dynamic Library (per LGPL))", "Usage"] = "File"
df.loc[df["Usage"] =="Snippet (+ File, Component, Dynamic Library (per LGPL))", "Usage"] = "File"

df.loc[df["Usage"] =="Dynamic Library", "Usage"] = "File"
df.loc[df["Usage"] =="Dynamic  Library", "Usage"] = "File"
df.loc[df["Usage"] =="Library Link", "Usage"] = "File"

df.loc[df["Usage"] =="PREREQUISITE_DYNAMIC_LIBRARY", "Usage"] = "File"
df.loc[df["Usage"] =="[prerequisite_dynamic_library]", "Usage"] = "File"
df.loc[df["Usage"] =="Other (Development Tool)", "Usage"] = "File"
df.loc[df["Usage"] =="ㅣㅑ", "Usage"] = "File"
df.loc[df["Usage"] =="File (+ File)", "Usage"] = "File"
df.loc[df["Usage"] =="Component (Module)", "Usage"] = "File"
df.loc[df["Usage"] =="File (+ Component, Dynamic Library (per LGPL))", "Usage"] = "File"
df.loc[df["Usage"] =="File (+ Dynamic Library (per LGPL))", "Usage"] = "File"

df.loc[df["Usage"] =="File (+ Component)", "Usage"] = "File"

df.loc[df["Usage"] =="Full", "Usage"] = "File"

df.loc[df["Usage"].isnull() & (df["License Conflict"] =="충돌없음"), "Usage"] = "File"


df.loc[df["Usage"] =="Partial", "Usage"] = "Snippet"


df.loc[df["Usage"].isnull() & (df["License Conflict"] =="프로젝트명"), "Usage"] = "Original Code"


In [12]:
df.loc[df["Code Match"].isnull() ==True, "Code Match"] = "1" 

In [13]:
df["Code Match"] = df["Code Match"].str.replace(',','').fillna(0).astype(float).astype(int)

In [14]:
df.loc[(df["License Conflict"] =="충돌없음") & (df["Code Match"] == 0 ), "Code Match"] = 1

In [15]:
df.loc[df["project license"].str.contains("SK Holdings License|Proprietary Commercial License|Proprietary License|Unspecified|ETRI 기술이전 Commercial License|Basic Proprietary Commercial License"), "project license"] = "[template] Basic Proprietary Commercial License"
df.loc[df["project license"].str.contains("GPL 3.0"), "project license"] = "GNU General Public License v3.0 or later"
df.loc[df["project license"].str.contains("GPL 2.0|GPL"), "project license"] = "GNU General Public License v2.0 or later"
df.loc[df["project license"] == "[o] GNU Lesser General Public License v3.0 or later", "project license"] = "GNU Lesser General Public License v3.0 or later"
df.loc[df["project license"].str.contains("LGPL 2.1"), "project license"] = "GNU Lesser General Public License v2.1 or later"

df.loc[df["project license"] == "BSD 2-clause \"Simplified\" License", "project license"] = "BSD 2-clause Simplified License"
df.loc[df["project license"] == "BSD 2-clause \'Simplified\' License", "project license"] = "BSD 2-clause Simplified License"
df.loc[df["project license"] == "BSD 3-clause \'New\' or \'Revised\' License", "project license"] = "BSD 3-clause New or Revised License"
df.loc[df["project license"] == "BSD 3-clause \"New\" or \"Revised\" License", "project license"] = "BSD 3-clause New or Revised License"

df.loc[df["project license"] == "BSD 2.0", "project license"] = "BSD 2-clause Simplified License"
df.loc[df["project license"] == "BSD 4-clause \"Original\" or \"Old\" License", "project license"] = "BSD 4-clause Original or Old License"

df.loc[df["project license"] == "Apache License Version 2.0", "project license"] = "Apache License 2.0"
df.loc[df["project license"] =="GNU Library General Public License v2 or later", "project license"] = "GNU Library General Public License v2.0 or later"

df.loc[df["License"] =="GNU Affero General Public License v3.0", "License"] = "Affero GPL"


In [16]:
# classifying copyleft
license_cat = df.groupby("License Conflict")["License"].unique().to_frame().reset_index()
license_con_3 = license_cat.loc[license_cat["License Conflict"] == "프로젝트에 선언된 라이선스와 충돌", "License"]
license_pro = license_con_3.loc[3]
license_pro

df.loc[df["License"].isin(license_pro) & (df["License Conflict"] == "프로젝트에 선언된 라이선스와 충돌")].head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Project Index,License Conflict,Component,Version,Home Page,License,Usage,Code Match,year,service,customer,project license,file
2,2,2,2,프로젝트에 선언된 라이선스와 충돌,Network Sending Packet Generator,nspg 0.5.0,http://sourceforge.net/projects/nspg/,GNU General Public License v2.0 or later,Snippet,1,2013,KOSSA_Dev_Competition,KOSSA,[template] Basic Proprietary Commercial License,ossdev2013_15_NetScanV3Report.xlsx
7,7,7,0,프로젝트에 선언된 라이선스와 충돌,A52DEC,Unspecified,http://liba52.sourceforge.net/,GNU General Public License v2.0 or later,Component,75,2013,KOSSA_Dev_Competition,KOSSA,[template] Basic Proprietary Commercial License,ossdev2013_18Report.xlsx
9,9,9,2,프로젝트에 선언된 라이선스와 충돌,avcap,Unspecified,http://sourceforge.net/projects/libavcap/,GNU General Public License v3.0 or later,Component,161,2013,KOSSA_Dev_Competition,KOSSA,[template] Basic Proprietary Commercial License,ossdev2013_18Report.xlsx
10,10,10,3,프로젝트에 선언된 라이선스와 충돌,FFmpeg,Unspecified,http://ffmpeg.sourceforge.net/,GNU Lesser General Public License v2.1 or later,Component,9262,2013,KOSSA_Dev_Competition,KOSSA,[template] Basic Proprietary Commercial License,ossdev2013_18Report.xlsx
12,12,12,5,프로젝트에 선언된 라이선스와 충돌,Freeware Advanced Audio Coder - faad2-src,Unspecified,http://sourceforge.net/projects/faac/,GNU General Public License v2.0 or later,Component,120,2013,KOSSA_Dev_Competition,KOSSA,[template] Basic Proprietary Commercial License,ossdev2013_18Report.xlsx


In [17]:
df.loc[ (df["project license"] == "[template] Basic Proprietary Commercial License") &
        (df["License Conflict"] == "프로젝트에 선언된 라이선스와 충돌"), "License"].value_counts().head()

GNU Lesser General Public License v2.1 or later    2082
GNU General Public License v2.0 or later           1671
Eclipse Public License 1.0                          999
GNU Lesser General Public License v3.0 or later     365
Sun GPL With Classpath Exception v2.0               351
Name: License, dtype: int64

In [18]:
# 통계를 위한 전처리
df["License Modified"] = df["License"]

df.loc[df["License Modified"] =="MIT License", "License Modified"] = "MIT"
df.loc[df["License Modified"] =="BSD 4-clause Original or Old License", "License Modified"] = "BSD 4.0"
df.loc[df["License Modified"] =="BSD 3-clause New or Revised License", "License Modified"] = "BSD 3.0"
df.loc[df["License Modified"] =="BSD 2-clause Simplified License", "License Modified"] = "BSD 2.0"
df.loc[df["License Modified"] =="Apache License 2.0", "License Modified"] = "Apache 2.0"
df.loc[df["License Modified"] =="Apache License 1.1", "License Modified"] = "Apache 1.1"
df.loc[df["License Modified"] =="ISC License", "License Modified"] = "ISC"
df.loc[df["License Modified"] =="Public Domain", "License Modified"] = "PD"
df.loc[df["License Modified"] =="zlib License", "License Modified"] = "zlib"
df.loc[df["License Modified"] =="Code Project Open License 1.02", "License Modified"] = "CPOL 1.02"
df.loc[df["License Modified"] =="Code Project Open 1.02 License", "License Modified"] = "CPOL 1.02"


df.loc[df["License Modified"] =="OpenSSL Combined License", "License Modified"] = "OpenSSL"
df.loc[df["License Modified"] =="Jdom License", "License Modified"] = "Jdom"
df.loc[df["License Modified"] =="Dom4j License", "License Modified"] = "Dom4j"
df.loc[df["License Modified"] =="Microsoft Public License", "License Modified"] = "MSPL"
df.loc[df["License Modified"] =="Apple MIT License", "License Modified"] = "Apple MIT"

df.loc[df["License Modified"] =="Proprietary Commercial License", "License Modified"] = "Commercial"
df.loc[df["License Modified"] =="Proprietary License", "License Modified"] = "Commercial"
df.loc[df["License Modified"] =="Commercial License", "License Modified"] = "Commercial"
df.loc[df["License Modified"] =="Commercial License (Purchased)", "License Modified"] = "Commercial"
df.loc[df["License Modified"] =="MySQL Commercial License", "License Modified"] = "Commercial"
df.loc[df["License Modified"] =="EmguCV Commercial License", "License Modified"] = "Commercial"
df.loc[df["License Modified"] =="Ext JS Commercial License", "License Modified"] = "Commercial"


df.loc[df["License Modified"] =="Oracle Technology Network Development and Distribution License", "License Modified"] = "Oracle"
df.loc[df["License Modified"] =="Oracle Technology Network Development and Distribution 01-2009 License", "License Modified"] = "Oracle"
df.loc[df["License Modified"] =="Oracle Technology Network Development and Distribution 01-2008 License", "License Modified"] = "Oracle"
df.loc[df["License Modified"] =="Oracle Binary Code JRE and JDK 7 License", "License Modified"] = "Oracle"
df.loc[df["License Modified"] =="Oracle Berkeley DB License", "License Modified"] = "Oracle"
df.loc[df["License Modified"] =="Oracle Java SE and JavaFX License", "License Modified"] = "Oracle"
df.loc[df["License Modified"] =="Oracle Technology Network Developer Terms License", "License Modified"] = "Oracle"

df.loc[df["License Modified"] =="Oracle Technology Network JDeveloper License", "License Modified"] = "Oracle"
df.loc[df["License Modified"] =="Oracle Technology Network Development and Distribution 11/2016 License", "License Modified"] = "Oracle"
df.loc[df["License Modified"] =="Oracle Binary Code License for Java EE Technologies", "License Modified"] = "Oracle"
df.loc[df["License Modified"] =="Oracle Berkeley DB Java Edition License", "License Modified"] = "Oracle"
df.loc[df["License Modified"] =="Oracle SQL Developer License", "License Modified"] = "Oracle"
df.loc[df["License Modified"] =="Oracle Technology Network Java EE 6 SDK License", "License Modified"] = "Oracle"
df.loc[df["License Modified"] =="Oracle BCL for Java SE Platform Products and JavaFX 2017 Restricted", "License Modified"] = "Oracle"

df.loc[df["License Modified"] =="Oracle", "License Modified"] = "Commercial"

df.loc[df["License Modified"] =="GNU General Public License v3.0 or later", "License Modified"] = "GPL 3.0"
df.loc[df["License Modified"] =="GNU General Public License v3.0 w/Autoconf exception", "License Modified"] = "GPL 3.0"
df.loc[df["License Modified"] =="GNU General Public License v3.0 or later with Bison exception 2.2", "License Modified"] = "GPL 3.0"
df.loc[df["License Modified"] =="GPL 3.0 with Classpath Exception", "License Modified"] = "GPL 3.0"
df.loc[df["License Modified"] =="GPL 2.0 with OpenSSL Exception", "License Modified"] = "GPL 3.0"
df.loc[df["License Modified"] =="Ext Exception to GPL 3.0 for Applications", "License Modified"] = "GPL 3.0"

df.loc[df["License Modified"] =="GNU General Public License v3.0 or later (GCC Exception)", "License Modified"] = "GPL 3.0"


df.loc[df["License Modified"] =="GNU General Public License v3.0 only", "License Modified"] = "GPL 3.0"



df.loc[df["License Modified"] =="GNU General Public License v2.0 or later", "License Modified"] = "GPL 2.0"
df.loc[df["License Modified"] =="Sun GPL With Classpath Exception v2.0", "License Modified"] = "GPL 2.0"
df.loc[df["License Modified"] =="GNU General Public License v2.0 w/Autoconf exception", "License Modified"] = "GPL 2.0"
df.loc[df["License Modified"] =="GNU General Public License v2.0 w/GCC Runtime Library exception", "License Modified"] = "GPL 2.0"
df.loc[df["License Modified"] =="GNU General Public License with FLOSS Exception v2.0", "License Modified"] = "GPL 2.0"
df.loc[df["License Modified"] =="GNU General Public License v2.0 w/Classpath exception", "License Modified"] = "GPL 2.0"
df.loc[df["License Modified"] =="GNU General Public License v2.0 only/ FOSS License Exception", "License Modified"] = "GPL 2.0"



df.loc[df["License Modified"] =="Bison GPL 2.0 with Exception", "License Modified"] = "GPL 2.0"
df.loc[df["License Modified"] =="Autoconf GPL 2.0 with Exception", "License Modified"] = "GPL 2.0"
df.loc[df["License Modified"] =="VirtualBox Exception to GPL 2.0 only", "License Modified"] = "GPL 2.0"
df.loc[df["License Modified"] =="GStreamer exception to GPL 2.0 or later", "License Modified"] = "GPL 2.0"
df.loc[df["License Modified"] =="GCC GPL 2.0 with inclusion or link exception", "License Modified"] = "GPL 2.0"
df.loc[df["License Modified"] =="tinylogin GPL2.0 License", "License Modified"] = "GPL 2.0"
df.loc[df["License Modified"] =="FreeRTOS Modified GPL License", "License Modified"] = "GPL 2.0"
df.loc[df["License Modified"] =="Nmap Exception to GNU General Public License v2.0 only", "License Modified"] = "GPL 2.0"

df.loc[df["License Modified"].str.contains("GNU Lesser General Public License 2.1 or later"), "License Modified"] ="LGPL 2.1"
df.loc[df["License Modified"].str.contains("GNU Lesser General Public License 3.0 or later"), "License Modified"] ="LGPL 3.0"

df.loc[df["License Modified"] =="GNU General Public License v2.0 only", "License Modified"] = "GPL 2.0"

df.loc[df["License Modified"] =="GNU General Public License v1.0 or later", "License Modified"] = "GPL 1.0"
df.loc[df["License Modified"] =="GNU General Public License v1.0 only", "License Modified"] = "GPL 1.0"


df.loc[df["License Modified"] =="GNU Lesser General Public License v3.0 or later", "License Modified"] = "LGPL 3.0"
df.loc[df["License Modified"] =="GNU Lesser General Public License v3.0 only", "License Modified"] = "LGPL 3.0"

df.loc[df["License Modified"] =="GNU Lesser General Public License v2.1 or later", "License Modified"] = "LGPL 2.1"
df.loc[df["License Modified"] =="GNU Lesser General Public License v2.1 only", "License Modified"] = "LGPL 2.1"

df.loc[df["License Modified"] =="GNU Lesser General Public License v2.0 or later", "License Modified"] = "LGPL 2.1"
df.loc[df["License Modified"] =="GNU Library General Public License v2.0 or later", "License Modified"] = "LGPL 2.1"
df.loc[df["License Modified"] =="GNU Library General Public License v2 only", "License Modified"] = "LGPL 2.1"
df.loc[df["License Modified"] =="RXTX Exception to LGPL 2.1 License", "License Modified"] = "LGPL 2.1"
df.loc[df["License Modified"] =="Tracetool Exception to LGPL 2.1", "License Modified"] = "LGPL 2.1"
df.loc[df["License Modified"] =="Nokia Qt LGPL 2.1 Exception 1.0 License", "License Modified"] = "LGPL 2.1"
df.loc[df["License Modified"] =="Libwebsockets LGPL 2.1 with static linking exception License", "License Modified"] = "LGPL 2.1"


df.loc[df["License Modified"] =="Creative Commons Attribution Non Commercial 3.0", "License Modified"] = "CCBY"
df.loc[df["License Modified"] =="Creative Commons Attribution Share Alike 2.5", "License Modified"] = "CCBY"
df.loc[df["License Modified"] =="Creative Commons Attribution Non Commercial 3.0 Unported", "License Modified"] = "CCBY"
df.loc[df["License Modified"] =="Creative Commons Attribution Non Commercial 4.0", "License Modified"] = "CC BY"
df.loc[df["License Modified"] =="Creative Commons Attribution Non Commercial Share Alike 2.0", "License Modified"] = "CCBY"
df.loc[df["License Modified"] =="Creative Commons Attribution-NonCommercial-ShareAlike 3.0 License", "License Modified"] = "CCBY"
df.loc[df["License Modified"] =="Creative Commons Attribution Non Commercial 2.0", "License Modified"] = "CCBY"
df.loc[df["License Modified"] =="Creative Commons Attribution Non Commercial Share Alike 3.0", "License Modified"] = "CCBY"

df.loc[df["License Modified"] =="Eclipse Public License 1.0", "License Modified"] = "EPL 1.0"
df.loc[df["License Modified"] =="Eclipse Public License 2.0", "License Modified"] = "EPL 2.0"

df.loc[df["License Modified"] =="Common Public License 1.0", "License Modified"] = "CPL 1.0"
df.loc[df["License Modified"] =="Mozilla Public License 1.0", "License Modified"] = "MPL 1.0"
df.loc[df["License Modified"] =="Mozilla Public License 1.1", "License Modified"] = "MPL 1.1"
df.loc[df["License Modified"] =="Mozilla Public License 2.0", "License Modified"] = "MPL 2.0"

df.loc[df["License Modified"] =="Code Project Open License", "License Modified"] = "CPOL"
df.loc[df["License Modified"] =="Code Project Open License 1.02", "License Modified"] = "CPOL 1.02"

df.loc[df["License Modified"] =="Common Development and Distribution License 1.0", "License Modified"] = "CDDL 1.0"
df.loc[df["License Modified"] =="Common Development and Distribution License 1.1", "License Modified"] = "CDDL 1.1"


In [19]:
df["License Modified"].value_counts()

Apache 2.0                                                                                              34960
MIT                                                                                                     27957
Unspecified                                                                                              9325
BSD 3.0                                                                                                  5018
GPL 2.0                                                                                                  3277
Commercial                                                                                               3228
LGPL 2.1                                                                                                 2754
ISC                                                                                                      1212
EPL 1.0                                                                                                  1204
BSD 2.0   

In [20]:
temp_license = df.loc[ (df["project license"] == "[template] Basic Proprietary Commercial License") &
        (df["License Conflict"] == "프로젝트에 선언된 라이선스와 충돌"), ["License Conflict","License Modified", "project license"]]

In [21]:
filtered_copyleft = list(temp_license["License Modified"].value_counts().index)
print(len(filtered_copyleft))

63


In [22]:


# 이거 수정
ex_licenses = ["BSD 3.0", "BSD 2.0", 
               "CPOL", "CPOL 1.02", 
               "Apache 2.0", "MIT", "zlib", "Python Software Foundation License 2.3", "Commercial"]
for i in ex_licenses:
    if i in filtered_copyleft:
        filtered_copyleft.remove(i)

        
filtered_copyleft = str(filtered_copyleft).replace(", ", "|").replace("\'", "").replace("[","").replace("]","")
print(filtered_copyleft[:100])

GPL 2.0|LGPL 2.1|EPL 1.0|LGPL 3.0|GPL 3.0|CPL 1.0|CCBY|MPL 1.1|Affero GPL|Sun License for J2SDK|MPL 


In [23]:
print(filtered_copyleft)

GPL 2.0|LGPL 2.1|EPL 1.0|LGPL 3.0|GPL 3.0|CPL 1.0|CCBY|MPL 1.1|Affero GPL|Sun License for J2SDK|MPL 2.0|Artistic License 1.0|GPL 1.0|SharpZipLib GPL License (GPL w/exception)|CDDL 1.1|Unknown License|Apple Public Source License 2.0|MPL 1.0|CC BY|Academic Free License v2.1|InterBase Public License v1|Unix Network Programming Book License|Treeview Freeware License|Sun GPL With Classpath Exception (GPL+)|com.oreilly.servlet License|Hans Dietrich Public Domain License|Sun Community Source License 3.0|wxWindows Library License|Mibble GPL License (GPL w/exception)|SIL Open Font License 1.1|Microsoft Reciprocal License|Numerical Recipes Personal Single-User License|Common Public 0.5 License|JIMMY BRUSH|GCC Runtime Library exception 3.1|AmCharts Linkware License|Netscape Public License v1.1|eCos license version 2.0|Fraunhofer FDK AAC Codec Library for Android License|Honest Public License|Sleepycat License|CDDL 1.0|"OReilly Fair Use License"|CeCILL-C Free Software License Agreement|Apple Publi

In [24]:
df.loc[df["License Modified"].str.contains(filtered_copyleft), "iscopyleft"] = "Copyleft"
df.loc[df["License Modified"] == "Commercial", "iscopyleft"] = "Commercial"
df.loc[(df["iscopyleft"] != "Copyleft") & (df["License Modified"] != "Commercial"), "iscopyleft"] = "Permissive"



  """Entry point for launching an IPython kernel.


In [25]:
df["iscopyleft"].unique()

array(['Permissive', 'Copyleft', 'Commercial'], dtype=object)

In [26]:
df.loc[df["iscopyleft"] == "Copyleft", "License Modified"].value_counts()

GPL 2.0                                                                       3277
LGPL 2.1                                                                      2754
EPL 1.0                                                                       1204
GPL 3.0                                                                        627
LGPL 3.0                                                                       499
CDDL 1.0                                                                       465
CPL 1.0                                                                        388
MPL 1.1                                                                        325
CDDL 1.1                                                                       240
SIL Open Font License 1.1                                                      220
CCBY                                                                           129
Sun JavaBeans Activation Framework 1.1 License for non-redistributables        105
MPL 

In [27]:
import datetime
# data backup
lastday = "data/total_data_preprocessed_" + str(datetime.date.today()) + ".csv"
df.to_csv(lastday, encoding="cp949")

In [28]:
df.loc[df["year"] == 2013, "iscopyleft"].value_counts()

Permissive    5466
Copyleft      1439
Commercial     287
Name: iscopyleft, dtype: int64

In [29]:
df.loc[df["year"] == 2016, "Component"].count()

19020

In [30]:
df.loc[(df["year"] == 2016) & (df["License Conflict"]=="프로젝트명"), "Component"].count()

461

Series([], Name: License Modified, dtype: object)